## Novel dataset web scraper

Gathers novel information from novelupdates, http://www.novelupdates.com/,
then cleans the data and arrange everything into a dataset.
The dataset is finally saved as a csv file.

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

novel_list_page = "http://www.novelupdates.com/novelslisting/?st=1&pg="
novel_page = "http://www.novelupdates.com/?p="

# For testing
novels_max_pages = 2

There do not seem to be an easy way to get all novel ids. Therefore, these are gathered from existing list of novels. First the maximum number of novel pages is retrieved and then the novels on these are iterated to get the ids.

In [None]:
# Get the maximum number of pages with novel
def get_novel_list_max_pages(page):
    soup = BeautifulSoup(page.content, 'html.parser')
    dig_pag = soup.find('div', attrs={'class':'digg_pagination'})
    page_links = dig_pag.find_all('a')
    last_page_link = str(page_links[2]) # The last page is the 3rd
    num = re.search('pg=\d+', last_page_link).group()[3:]
    return int(num)

page = requests.get(novel_list_page + '1')
novels_max_pages = get_novel_list_max_pages(page)
print("Pages with novels: " + str(novels_max_pages))


# Get all novel ids from the novel lists
def get_novel_ids(page):
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find('table', attrs={'id':'myTable'})
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    
    novel_ids = []
    for row in rows:
        col = row.find_all('td')[-1]
        novel_id = col.a['id'][3:]
        novel_ids.append(novel_id)
    return novel_ids

all_novel_ids = []
for i in range(1,novels_max_pages+1):
    page = requests.get(novel_list_page + str(i))
    novel_ids = get_novel_ids(page)
    all_novel_ids.extend(novel_ids)
    time.sleep(1)

df = pd.DataFrame(all_novel_ids, columns=['id'])

In [None]:
#Parsing help functions

def get_value(element, check=lambda e: e.string, func=lambda e: e.string.strip()):
    if check(element) is None:
        return None
    return func(element)

def get_bool(string):
    '''possible values is Yes, No, N/A'''
    if string == "Yes":
        return True
    elif string == "No":
        return False
    else:
        return None

In [75]:
def general_info(page, )
# return a dict ??

def publisher_info(page, )

def chapter_info(page, )

def release_info(page, )

def community_info(page, )
# both rating / community

def relation_info(page, )

def parse_novel_page(id_num):
    page = requests.get(novel_page + str(id_num))
    soup = BeautifulSoup(page.content, 'html.parser')
    content = soup.find('div', attrs={'class': 'w-blog-content'})
    if content is None:
        return pd.Series() 
    data = {}
    
    #TODO
    print(id_num)
    
    # general information
    data['id'] = int(id_num)
    data['name'] = get_value(content.div)
    data['assoc_names'] = list(content.find('div', attrs={'id': 'editassociated'}).stripped_strings)
    data['original_langauge'] = get_value(content.find('div', attrs={'id': 'showlang'}), 
                                          lambda e: e.a, 
                                          lambda e: e.text.strip().lower())
    data['authors'] = [author.text.lower()
                for author in content
                  .find('div', attrs={'id': 'showauthors'})
                  .find_all('a')]
    data['genres'] = [genre.text.lower()
                for genre in content
                  .find('div', attrs={'id': 'seriesgenre'})
                  .find_all('a', attrs={'class': 'genre'})]
    data['tags'] = [tag.text.lower()
                for tag in content
                  .find('div', attrs={'id': 'showtags'})
                  .find_all('a')]
    
    # publisher
    data['start_year'] = get_value(content.find('div', attrs={'id': 'edityear'}),)
    data['licensed'] = get_bool(get_value(content.find('div', attrs={'id': 'showlicensed'})))
    data['original_publisher'] = get_value(content.find('div', attrs={'id': 'showopublisher'}),
                                               lambda e: e.a, 
                                               lambda e: e.a.string.strip().lower())
    data['english_publisher'] = get_value(content.find('div', attrs={'id': 'showepublisher'}),
                                              lambda e: e.a, 
                                              lambda e: e.a.string.strip().lower())
    
    # chapters
    chapter_status = get_value(content.find('div', attrs={'id': 'editstatus'}))
    if chapter_status is not None:
        data['complete_original'] = 'complete' in chapter_status.lower()
        chapter_current = re.search('(.*)\(', chapter_status).group(1).strip()
        data['chapters_original_current'] = chapter_current if chapter_current != "" else None 
    data['complete_translated'] = get_bool(get_value(content.find('div', attrs={'id': 'showtranslated'})))
    
    table = soup.find('table', attrs={'id': 'myTable'})
    if table is not None:
        release_table = table.find('tbody')
        data['chapter_latest_translated'] = release_table.find('tr').find_all('td')[2].a.string.strip()
        
    # current release activity
    release_freq = content.find('h5', attrs={'class': 'seriesother'}, string='Release Frequency').next_sibling
    activity = content.find_all('span', attrs={'class': 'userrate rank'})
    data['release_freq'] = float(re.search('\d+\.?\d*', release_freq).group(0))
    data['activity_week_rank'] = int(activity[0].string[1:])
    data['activity_month_rank'] = int(activity[1].string[1:])
    data['activity_all_time_rank'] = int(activity[2].string[1:])
    
    # community
    data['on_reading_lists'] = int(content.find('b', attrs={'class': 'rlist'}).string)
    data['reading_list_month_rank'] = int(activity[3].string[1:])
    data['reading_list_all_time_rank'] = int(activity[4].string[1:])
    
    # rating
    rating_text = content.find('span', attrs={'class': 'uvotes'}).text.split(' ')
    data['rating'] = float(rating_text[0][1:])
    data['rating_votes'] = int(rating_text[3])
    
    # relations
    any_related = content.find('h5', attrs={'class': 'seriesother'}, string='Related Series').next_sibling
    if "N/A" not in any_related:
        related_series_first = any_related.next_sibling.get('id')[3:]
        data['related_series_ids'] = [related_series_first]

    data['recommended_series_ids'] = []    
    for series in soup.find_all('a', attrs={'class': 'genre'}, recursive=False):
        if series.has_attr('title'):
            data['recommended_series_ids'].append(series.get('id')[3:])
        else:
            data['related_series_ids'].append(series.get('id')[3:])
    time.sleep(1)
    return pd.Series(data)

In [76]:
df = pd.merge(df, df.id.apply(lambda x: parse_novel_page(x)), left_index=True, right_index=True)
df = df.id.apply(lambda x: parse_novel_page(x))
print(df.head)
#df.to_csv('novels.csv', header=True, index=False)

1173
2 Volumes
16166
42 Chapters
14845
None
1175
2 Volumes


AttributeError: 'NoneType' object has no attribute 'group'