## Novel dataset web scraper

Gathers novel information from novelupdates, http://www.novelupdates.com/,
then cleans the data and arrange everything into a dataset.
The dataset is finally saved as a csv file.

In [68]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

novel_list_page = "http://www.novelupdates.com/novelslisting/?st=1&pg="
novel_page = "http://www.novelupdates.com/?p="

There do not seem to be an easy way to get all novel ids. Therefore, these are gathered from existing list of novels. First the maximum number of novel pages is retrieved and then the novels on these are iterated to get the ids.

In [69]:
# Get the maximum number of pages with novel
def get_novel_list_max_pages(page):
    soup = BeautifulSoup(page.content, 'html.parser')
    dig_pag = soup.find('div', attrs={'class':'digg_pagination'})
    page_links = dig_pag.find_all('a')
    last_page_link = str(page_links[2]) # The last page is the 3rd
    num = re.search('pg=\d+', last_page_link).group()[3:]
    return int(num)

page = requests.get(novel_list_page + '1')
novels_max_pages = get_novel_list_max_pages(page)
print("Pages with novels: " + str(novels_max_pages))

# For testing
novels_max_pages = 2

# Get all novel ids from the novel lists
def get_novel_ids(page):
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find('table', attrs={'id':'myTable'})
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    
    novel_ids = []
    for row in rows:
        col = row.find_all('td')[-1]
        novel_id = col.a['id'][3:]
        novel_ids.append(novel_id)
    return novel_ids

all_novel_ids = []
for i in range(1,novels_max_pages+1):
    page = requests.get(novel_list_page + str(i))
    novel_ids = get_novel_ids(page)
    all_novel_ids.extend(novel_ids)
    time.sleep(1)

df = pd.DataFrame(all_novel_ids, columns=['id'])

Pages with novels: 155


In [71]:
def parse_novel_page(id_num):
    page = requests.get(novel_page + str(id_num))
    soup = BeautifulSoup(page.content, 'html.parser')
    content = soup.find('div', attrs={'class': 'w-blog-content'})
    if content is None:
        return pd.Series() 
    data = {}
    
    #TODO
    print(id_num)
    
    # general information
    data['id'] = int(id_num)
    data['name'] = content.div.string.strip()
    data['assoc_names'] = list(content
                               .find('div', attrs={'id': 'editassociated'})
                               .stripped_strings)
    data['authors'] = [author.text.lower()
                for author in content
                  .find('div', attrs={'id': 'showauthors'})
                  .findAll('a')]
    data['org_langauge'] = content.find('div', attrs={'id': 'showlang'}).a.text
    data['genres'] = [genre.text.lower()
                for genre in content
                  .find('div', attrs={'id': 'seriesgenre'})
                  .find_all('a', attrs={'class': 'genre'})]
    data['tags'] = [tag.text.lower()
                for tag in content
                  .find('div', attrs={'id': 'showtags'})
                  .find_all('a')]
    
    # publisher
    data['start_year'] = int(content.find('div', attrs={'id': 'edityear'}).string.strip())
    data['licensed'] = True if content.find('div', attrs={'id': 'showlicensed'}).string.strip() == 'Yes' else False
    data['original_publisher'] = content.find('div', attrs={'id': 'showopublisher'}).a.string.strip().lower()
    data['english_publisher'] = content.find('div', attrs={'id': 'showepublisher'}).a.string.strip().lower()

    # chapters
    chapter_status = content.find('div', attrs={'id': 'editstatus'}).string
    chapter_table = soup.find('table', attrs={'id': 'myTable'}).find('tbody')
    complete_translated = content.find('div', attrs={'id': 'showtranslated'}).a.string.strip()
    
    data['chapters_original_current'] = int(re.search('\d+ Chapters', chapter_status).group(0).split(' ')[0])
    data['chapter_latest_translated'] = chapter_table.find('tr').find_all('td')[2].a.string.strip()
    data['complete_original'] = 'complete' in chapter_status.string.lower()
    data['complete_translated'] = True if  complete_translated == 'Yes' else False
    
    # current release activity
    release_freq = content.find('h5', attrs={'class': 'seriesother'}, string='Release Frequency').next_sibling
    activity = content.find_all('span', attrs={'class': 'userrate rank'})
    data['release_freq'] = float(re.search('\d+\.?\d*', release_freq).group(0))
    data['activity_week_rank'] = int(activity[0].string[1:])
    data['activity_month_rank'] = int(activity[1].string[1:])
    data['activity_all_time_rank'] = int(activity[2].string[1:])
    
    # community
    data['on_reading_lists'] = int(content.find('b', attrs={'class': 'rlist'}).string)
    data['reading_list_month_rank'] = int(activity[3].string[1:])
    data['reading_list_all_time_rank'] = int(activity[4].string[1:])
    
    # rating
    rating_text = content.find('span', attrs={'class': 'uvotes'}).text.split(' ')
    data['rating'] = float(rating_text[0][1:])
    data['rating_votes'] = int(rating_text[3])
    
    # relations
    related_series_first = content.find('h5', attrs={'class': 'seriesother'}, string='Related Series')\
        .next_sibling.next_sibling.get('id')[3:]
    related_series_others = []
    recommended_series_ids = []
    for series in soup.find_all('a', attrs={'class': 'genre'}, recursive=False):
        if series.has_attr('title'):
            recommended_series_ids.append(series.get('id')[3:])
        else:
            related_series_others.append(series.get('id')[3:])
    data['related_series_ids'] = [related_series_first] + related_series_others
    data['recommended_series_ids'] = recommended_series_ids
    
    time.sleep(1)
    return pd.Series(data)

#df = parse_novel_page('http://www.novelupdates.com/series/i-shall-seal-the-heavens/')

df = pd.merge(df, df.id.apply(lambda x: parse_novel_page(x)), left_index=True, right_index=True)
df = df.id.apply(lambda x: parse_novel_page(x))
print(df.head)
#df.to_csv('novels.csv', header=True, index=False)

1173


AttributeError: 'NoneType' object has no attribute 'find'