# Novel dataset web scraper

Gathers novel information from novelupdates, http://www.novelupdates.com/,
then cleans the data and arrange everything into a dataset.
The dataset is finally saved as a csv file.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

novel_list_page = "http://www.novelupdates.com/novelslisting/?st=1&pg="
novel_page = "http://www.novelupdates.com/?p="

# For testing
novels_max_pages = 2

There do not seem to be an easy way to get all novel ids. Therefore, these are gathered from existing list of novels. First the maximum number of novel pages is retrieved and then the novels on these are iterated to get the ids.

In [2]:
# Get the maximum number of pages with novel
def get_novel_list_max_pages(page):
    soup = BeautifulSoup(page.content, 'html.parser')
    dig_pag = soup.find('div', attrs={'class':'digg_pagination'})
    page_links = dig_pag.find_all('a')
    last_page_link = str(page_links[2]) # The last page is the 3rd
    num = re.search('pg=\d+', last_page_link).group()[3:]
    return int(num)

page = requests.get(novel_list_page + '1')
novels_max_pages = get_novel_list_max_pages(page)
print("Pages with novels: " + str(novels_max_pages))


# Get all novel ids from the novel lists
def get_novel_ids(page):
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find('table', attrs={'id':'myTable'})
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    
    novel_ids = []
    for row in rows:
        col = row.find_all('td')[-1]
        novel_id = col.a['id'][3:]
        novel_ids.append(novel_id)
    return novel_ids

all_novel_ids = []
for i in range(1,novels_max_pages+1):
    page = requests.get(novel_list_page + str(i))
    novel_ids = get_novel_ids(page)
    all_novel_ids.extend(novel_ids)
    time.sleep(1)

df = pd.DataFrame(all_novel_ids, columns=['id'])

Pages with novels: 200


## Parsing help functions

In [39]:
def get_value(element, check=lambda e: e.string, parse=lambda e: e.string.strip()):
    if check(element) is None:
        return None
    pe = parse(element)
    if ''.join(pe) == 'N/A':
        return None
    return pe

              
def get_value_str_txt(element, check_str=lambda e: e.string, parse_str=lambda e: e.string.strip(),
                      check_txt=lambda e: e.text, parse_txt=lambda e: e.text.strip()):
    res_str = get_value(element, check_str, parse_str)
    res_txt = get_value(element, check_txt, parse_txt)
    return res_str or res_txt
              

def empty(element):
    return get_value(element) == ""


def get_bool(string):
    '''possible values is Yes, No, N/A'''
    if string == "Yes":
        return True
    elif string == "No":
        return False
    else:
        return None

In [42]:
def general_info(content):
    gen_info = {}
    gen_info['name'] = get_value(content.div)
    gen_info['assoc_names'] = get_value(content.find('div', attrs={'id': 'editassociated'}), 
                                        check=lambda e: e, parse=lambda e: list(e.stripped_strings))
    gen_info['original_langauge'] = get_value(content.find('div', attrs={'id': 'showlang'}), 
                                          lambda e: e.a, 
                                          lambda e: e.text.strip().lower())
    gen_info['authors'] = [author.text.lower()
                for author in content
                  .find('div', attrs={'id': 'showauthors'})
                  .find_all('a')]
    gen_info['genres'] = [genre.text.lower()
                for genre in content
                  .find('div', attrs={'id': 'seriesgenre'})
                  .find_all('a', attrs={'class': 'genre'})]
    gen_info['tags'] = [tag.text.lower()
                for tag in content
                  .find('div', attrs={'id': 'showtags'})
                  .find_all('a')]
    return gen_info


def publisher_info(content):
    pub_info = {}
    pub_info['start_year'] = get_value(content.find('div', attrs={'id': 'edityear'}),)
    pub_info['licensed'] = get_bool(get_value(content.find('div', attrs={'id': 'showlicensed'})))
    pub_info['original_publisher'] = get_value(content.find('div', attrs={'id': 'showopublisher'}),
                                               lambda e: e.a, 
                                               lambda e: e.a.string.strip().lower())
    pub_info['english_publisher'] = get_value(content.find('div', attrs={'id': 'showepublisher'}),
                                              lambda e: e.a, 
                                              lambda e: e.a.string.strip().lower())
    return pub_info


def chapter_info(soup, content):
    chap_info = {}
    chapter_status = get_value_str_txt(content.find('div', attrs={'id': 'editstatus'}))    
    if chapter_status is not None:    
        chap_info['complete_original'] = 'complete' in chapter_status.lower()
        chapter_current = re.search('([^\+\(])+', chapter_status).group(1).strip()
        chap_info['chapters_original_current'] = chapter_current if chapter_current != "" else None 
    chap_info['complete_translated'] = get_bool(get_value(content.find('div', attrs={'id': 'showtranslated'})))
    
    table = soup.find('table', attrs={'id': 'myTable'})
    if table is not None:
        release_table = table.find('tbody')
        chap_info['chapter_latest_translated'] = release_table.find('tr').find_all('td')[2].a.string.strip()
    return chap_info
    
    
def release_info(content):
    rel_info = {}
    release_freq = content.find('h5', attrs={'class': 'seriesother'}, string='Release Frequency').next_sibling
    activity = content.find_all('span', attrs={'class': 'userrate rank'})
    
    if not empty(release_freq):
        rel_info['release_freq'] = float(re.search('\d+\.?\d*', release_freq).group(0))
        
    rel_info['activity_week_rank'] = int(activity[0].string[1:])
    rel_info['activity_month_rank'] = int(activity[1].string[1:])
    rel_info['activity_all_time_rank'] = int(activity[2].string[1:])
    return rel_info
    

def community_info(content):
    comm_info = {}
    activity = content.find_all('span', attrs={'class': 'userrate rank'})
    comm_info['on_reading_lists'] = int(content.find('b', attrs={'class': 'rlist'}).string)
    comm_info['reading_list_month_rank'] = int(activity[3].string[1:])
    comm_info['reading_list_all_time_rank'] = int(activity[4].string[1:])
    
    # rating
    rating_text = content.find('span', attrs={'class': 'uvotes'}).text.split(' ')
    comm_info['rating'] = float(rating_text[0][1:])
    comm_info['rating_votes'] = int(rating_text[3])
    return comm_info
    
    
def relation_info(soup, content):
    rel_info = {}
    any_related = content.find('h5', attrs={'class': 'seriesother'}, string='Related Series').next_sibling
    if "N/A" not in any_related:
        related_series_first = any_related.next_sibling.get('id')[3:]
        rel_info['related_series_ids'] = [related_series_first]

    rel_info['recommended_series_ids'] = []    
    for series in soup.find_all('a', attrs={'class': 'genre'}, recursive=False):
        if series.has_attr('title'):
            rel_info['recommended_series_ids'].append(series.get('id')[3:])
        else:
            rel_info['related_series_ids'].append(series.get('id')[3:])
    return rel_info

In [43]:
def parse_novel_page(id_num):
    page = requests.get(novel_page + str(id_num))
    soup = BeautifulSoup(page.content, 'html.parser')
    content = soup.find('div', attrs={'class': 'w-blog-content'})
    if content is None:
        return pd.Series() 
    data = {}
    data['id'] = int(id_num)
    
    #TODO
    print(id_num)
    
    data.update(general_info(content))
    data.update(publisher_info(content))
    data.update(chapter_info(soup, content))
    data.update(release_info(content))
    data.update(community_info(content))
    data.update(relation_info(soup, content))
    
    time.sleep(1)
    return pd.Series(data)

df = pd.merge(df, df.id.apply(lambda x: parse_novel_page(x)), left_index=True, right_index=True)
df = df.id.apply(lambda x: parse_novel_page(x))
print(df.head)
#df.to_csv('novels.csv', header=True, index=False)

1173
<div id="editassociated">ドットハック エーアイバスター</div>
['ドットハック エーアイバスター']
['ドットハック エーアイバスター']
16166
<div id="editassociated"><span class="seriesna">N/A</span></div>
N/A
None
['N/A']
14845
<div id="editassociated"><span class="seriesna">N/A</span></div>
N/A
None
['N/A']
1175
<div id="editassociated"><span class="seriesna">N/A</span></div>
N/A
None
['N/A']
1177
<div id="editassociated"><span class="seriesna">N/A</span></div>
N/A
None
['N/A']
13038
<div id="editassociated">.hack//新約碑文</div>
['.hack//新約碑文']
['.hack//新約碑文']
2042
<div id="editassociated">ぼくたちと駐在さんの700日戦争<br>ぼくちゅう
</br></div>
['ぼくたちと駐在さんの700日戦争', 'ぼくちゅう']
['ぼくたちと駐在さんの700日戦争', 'ぼくちゅう']
6612
<div id="editassociated">Tenseishichatta Yo (Iya, Gomen)<br>転生しちゃったよ （いや、ごめん）</br></div>
['Tenseishichatta Yo (Iya, Gomen)', '転生しちゃったよ （いや、ごめん）']
['Tenseishichatta Yo (Iya, Gomen)', '転生しちゃったよ （いや、ごめん）']
22136
<div id="editassociated">I Was Told That I Wouldn’t Be Able To Defeat The Demon Lord And Was Kicked Out Of The Hero’s Party, So Now I J

9838
<div id="editassociated">99番目の吸血姫　～最後の吸血鬼～</div>
['99番目の吸血姫\u3000～最後の吸血鬼～']
['99番目の吸血姫\u3000～最後の吸血鬼～']
1187
<div id="editassociated">9S Memories<br>Nine S<br>ナインエス</br></br></div>
['9S Memories', 'Nine S', 'ナインエス']
['9S Memories', 'Nine S', 'ナインエス']
16193
<div id="editassociated">ABSCAY<br>亿万星辰不及你</br></div>
['ABSCAY', '亿万星辰不及你']
['ABSCAY', '亿万星辰不及你']
2310
<div id="editassociated">Time Will Never Go Back<br>那些回不去的年少时光</br></div>
['Time Will Never Go Back', '那些回不去的年少时光']
['Time Will Never Go Back', '那些回不去的年少时光']
16801
<div id="editassociated">一只拥有红包群的猫</div>
['一只拥有红包群的猫']
['一只拥有红包群的猫']
17164
</br></div>itassociated">Aru Erufu no Shuki<br>或るエルフの手記
['Aru Erufu no Shuki', '或るエルフの手記']
['Aru Erufu no Shuki', '或るエルフの手記']
829
<div id="editassociated">OVRMMO<br>To Aru Ossan no VRMMO Katsudouki<br>とあるおっさんのＶＲＭＭＯ活動記</br></br></div>
['OVRMMO', 'To Aru Ossan no VRMMO Katsudouki', 'とあるおっさんのＶＲＭＭＯ活動記']
['OVRMMO', 'To Aru Ossan no VRMMO Katsudouki', 'とあるおっさんのＶＲＭＭＯ活動記']
21237
<div id="editassociated

14479
<div id="editassociated">New Game! From the Bottom of the World<br>Saiteihen kara nyuu geemu! ~ Aete dorei ni natte isekai wo jitsuryoku dake de nukeagarimasu ~<br>最底辺からニューゲーム! ~あえて奴隷になって異世界を実力だけで駆け上がります~<br>～奴隷商人は次に地位と名誉と無垢な少女を手に入れます～</br></br></br></div>
['New Game! From the Bottom of the World', 'Saiteihen kara nyuu geemu! ~ Aete dorei ni natte isekai wo jitsuryoku dake de nukeagarimasu ~', '最底辺からニューゲーム! ~あえて奴隷になって異世界を実力だけで駆け上がります~', '～奴隷商人は次に地位と名誉と無垢な少女を手に入れます～']
['New Game! From the Bottom of the World', 'Saiteihen kara nyuu geemu! ~ Aete dorei ni natte isekai wo jitsuryoku dake de nukeagarimasu ~', '最底辺からニューゲーム! ~あえて奴隷になって異世界を実力だけで駆け上がります~', '～奴隷商人は次に地位と名誉と無垢な少女を手に入れます～']
18755
<div id="editassociated">完美战神</div>
['完美战神']
['完美战神']
20518
<div id="editassociated">POBE
<br>总裁离魂小记</br></div>
['POBE', '总裁离魂小记']
['POBE', '总裁离魂小记']
13576
<div id="editassociated">牧师传奇</div>
['牧师传奇']
['牧师传奇']
N/A
22376
</br></div>itassociated">ARTT<br>群雄逐鹿
['ARTT', '群雄逐鹿']
['ARTT', '群雄逐鹿']
2765
<div

AttributeError: 'NoneType' object has no attribute 'stripped_strings'