# Novel dataset web scraper

Gathers novel information from novelupdates, http://www.novelupdates.com/,
then cleans the data and arrange everything into a dataset.
The dataset is finally saved as a csv file.

In [37]:
import requests
import time
import re
import pandas as pd
from bs4 import BeautifulSoup

# Display all columns when showing dataframes
pd.set_option('display.max_columns', None)

novel_list_page = "http://www.novelupdates.com/novelslisting/?st=1&pg="
novel_page = "http://www.novelupdates.com/?p="

There do not seem to be an easy way to get all novel ids. Therefore, these are gathered from existing list of novels. First the maximum number of novel pages is retrieved and then the novels on these are iterated to get the ids.

In [38]:
# Get the maximum number of pages with novels
def get_novel_list_max_pages(page):
    soup = BeautifulSoup(page.content, 'html.parser')
    dig_pag = soup.find('div', attrs={'class':'digg_pagination'})
    max_page = max([int(a.text) for a in dig_pag.find_all('a') if a.text.isdigit()])
    return max_page

# Get all novel ids from a single page
def get_novel_ids(page):
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find('div', attrs={'class':'w-blog-content other'})
    novels = table.find_all('div', attrs={'class': 'search_title'})
    novel_ids = [novel.find('span', attrs={'class': 'rl_icons_en'}).get('id')[3:] for novel in novels]
    novel_ids = [int(n) for n in novel_ids]
    return novel_ids


page = requests.get(novel_list_page + '1')


# TOOD: For testing - only use 2 pages for now.
novels_max_pages = 1
#novels_max_pages = get_novel_list_max_pages(page)


print("Pages with novels: " + str(novels_max_pages))

all_novel_ids = []
for i in range(1,novels_max_pages+1):
    page = requests.get(novel_list_page + str(i))
    novel_ids = get_novel_ids(page)
    all_novel_ids.extend(novel_ids)
    time.sleep(1)

df = pd.DataFrame(all_novel_ids, columns=['id'])

Pages with novels: 1


In [39]:
def get_value(element, check=lambda e: e.string, parse=lambda e: e.string.strip()):
    if check(element) is None:
        return None
    pe = parse(element)
    if ''.join(pe) == 'N/A':
        return None
    return pe

              
def get_value_str_txt(element, check_str=lambda e: e.string, parse_str=lambda e: e.string.strip(),
                      check_txt=lambda e: e.text, parse_txt=lambda e: e.text.strip()):
    res_str = get_value(element, check_str, parse_str)
    res_txt = get_value(element, check_txt, parse_txt)
    return res_str or res_txt
              

def empty(element):
    return get_value(element) == ""


def get_bool(string):
    '''possible values is Yes, No, N/A'''
    if string == "Yes":
        return True
    elif string == "No":
        return False
    else:
        return None

In [40]:
def general_info(content):
    gen_info = {}
    gen_info['name'] = get_value(content.div)
    gen_info['assoc_names'] = get_value(content.find('div', attrs={'id': 'editassociated'}), 
                                        check=lambda e: e, parse=lambda e: list(e.stripped_strings))
    gen_info['original_langauge'] = get_value(content.find('div', attrs={'id': 'showlang'}), 
                                          lambda e: e.a, 
                                          lambda e: e.text.strip().lower())
    gen_info['authors'] = [author.text.lower()
                for author in content
                  .find('div', attrs={'id': 'showauthors'})
                  .find_all('a')]
    gen_info['genres'] = [genre.text.lower()
                for genre in content
                  .find('div', attrs={'id': 'seriesgenre'})
                  .find_all('a', attrs={'class': 'genre'})]
    gen_info['tags'] = [tag.text.lower()
                for tag in content
                  .find('div', attrs={'id': 'showtags'})
                  .find_all('a')]
    return gen_info


def publisher_info(content):
    pub_info = {}
    pub_info['start_year'] = get_value(content.find('div', attrs={'id': 'edityear'}),)
    pub_info['licensed'] = get_bool(get_value(content.find('div', attrs={'id': 'showlicensed'})))
    pub_info['original_publisher'] = get_value(content.find('div', attrs={'id': 'showopublisher'}),
                                               lambda e: e.a, 
                                               lambda e: e.a.string.strip().lower())
    pub_info['english_publisher'] = get_value(content.find('div', attrs={'id': 'showepublisher'}),
                                              lambda e: e.a, 
                                              lambda e: e.a.string.strip().lower())
    return pub_info


def chapter_info(soup, content):
    chap_info = {}
    chapter_status = get_value_str_txt(content.find('div', attrs={'id': 'editstatus'}))    
    if chapter_status is not None:    
        chap_info['complete_original'] = 'complete' in chapter_status.lower()
        chapter_current = re.search('([^\+\(])+', chapter_status).group(1).strip()
        chap_info['chapters_original_current'] = chapter_current if chapter_current != "" else None 
    chap_info['complete_translated'] = get_bool(get_value(content.find('div', attrs={'id': 'showtranslated'})))
    
    table = soup.find('table', attrs={'id': 'myTable'})
    if table is not None:
        release_table = table.find('tbody')
        chap_info['chapter_latest_translated'] = release_table.find('tr').find_all('td')[2].a.string.strip()
    return chap_info
    
    
def release_info(content):
    rel_info = {}
    release_freq = content.find('h5', attrs={'class': 'seriesother'}, string='Release Frequency').next_sibling
    activity = content.find_all('span', attrs={'class': 'userrate rank'})
    
    if not empty(release_freq):
        rel_info['release_freq'] = float(re.search('\d+\.?\d*', release_freq).group(0))
        
    rel_info['activity_week_rank'] = int(activity[0].string[1:])
    rel_info['activity_month_rank'] = int(activity[1].string[1:])
    rel_info['activity_all_time_rank'] = int(activity[2].string[1:])
    return rel_info
    

def community_info(content):
    comm_info = {}
    activity = content.find_all('span', attrs={'class': 'userrate rank'})
    comm_info['on_reading_lists'] = int(content.find('b', attrs={'class': 'rlist'}).string)
    comm_info['reading_list_month_rank'] = int(activity[3].string[1:])
    comm_info['reading_list_all_time_rank'] = int(activity[4].string[1:])
    
    # rating
    rating_text = content.find('span', attrs={'class': 'uvotes'}).text.split(' ')
    comm_info['rating'] = float(rating_text[0][1:])
    comm_info['rating_votes'] = int(rating_text[3])
    return comm_info
    
    
def relation_info(soup, content):
    rel_info = {}
    any_related = content.find('h5', attrs={'class': 'seriesother'}, string='Related Series').next_sibling
    if "N/A" not in any_related:
        related_series_first = any_related.next_sibling.get('id')[3:]
        rel_info['related_series_ids'] = [related_series_first]

    rel_info['recommended_series_ids'] = []    
    for series in soup.find_all('a', attrs={'class': 'genre'}, recursive=False):
        if series.has_attr('title'):
            rel_info['recommended_series_ids'].append(series.get('id')[3:])
        else:
            rel_info['related_series_ids'].append(series.get('id')[3:])
    return rel_info

In [41]:
def parse_novel_page(id_num):
    page = requests.get(novel_page + str(id_num))
    soup = BeautifulSoup(page.content, 'html.parser')
    content = soup.find('div', attrs={'class': 'w-blog-content'})
    if content is None:
        return pd.Series() 
    data = {'id': id_num}
    
    #TODO
    print(id_num)
    
    data.update(general_info(content))
    data.update(publisher_info(content))
    data.update(chapter_info(soup, content))
    data.update(release_info(content))
    data.update(community_info(content))
    data.update(relation_info(soup, content))
    
    time.sleep(1)
    return pd.Series(data)



df = df['id'].apply(lambda x: parse_novel_page(x))
df = df.set_index('id')
print(df.head)
#df.to_csv('novels.csv', header=True, index=False)

2151
10802
20789
24953
24068
2989
5732
5964
6363
24100
15752
23324
22007
24334
24856
20793
24656
6341
24512
20994
3766
8212
22604
21371
20728
<bound method NDFrame.head of        activity_all_time_rank  activity_month_rank  activity_week_rank  \
id                                                                       
2151                     2077                  942                 944   
10802                     128                   65                  69   
20789                    1454                  243                 301   
24953                    4362                 1224                 451   
24068                    3940                 1217                1877   
2989                     1585                 1950                2196   
5732                      721                  972                1260   
5964                     1055                 1026                 806   
6363                        2                    8                   7   
24100         