In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Step 1: Get the HTML content of the website

* `response = requests.get(url)`: 获取网页数据
* `response.encoding = 'gbk'`: 设置编码格式（中文）
* `soup = BeautifulSoup(...)`: 把网页传给 Beautiful Soup

* `print(soup.prettify())`: 清理网页并查看格式

In [2]:
# Category name -> URL of that category's listing page.
categories = {
    '童话故事': 'http://www.wpwx.cn/news/tonghua/',
    '儿童故事': 'http://www.wpwx.cn/news/gushi/',
    '神话故事': 'http://www.wpwx.cn/news/shenhua/',
}

# Pick one category's listing page to explore in the next cells.
url = categories['儿童故事']

# Download the page and decode the body as GBK before parsing it.
response = requests.get(url)
response.encoding = 'gbk'
soup = BeautifulSoup(markup=response.text, features='html.parser')

# Print the indented HTML so the page structure can be inspected.
pretty_html = soup.prettify()
print(pretty_html)

<!DOCTYPE html>
<!--[if lt IE 7 ]><html class="ie ie6" lang="zh-cmn-Hans"> <![endif]-->
<!--[if IE 7 ]><html class="ie ie7" lang="zh-cmn-Hans"> <![endif]-->
<!--[if IE 8 ]><html class="ie ie8" lang="zh-cmn-Hans"> <![endif]-->
<!--[if (gte IE 9)|!(IE)]><!-->
<html lang="zh-cmn-Hans">
 <!--<![endif]-->
 <head>
  <meta content="IE=edge,Chrome=1" http-equiv="X-UA-Compatible"/>
  <meta content="text/html;charset=utf-8" http-equiv="content-type"/>
  <!-- Your Basic Site Informations -->
  <title>
   儿童故事　儿童故事大全　儿童故事集锦　少儿故事大全－中国儿童文学网
  </title>
  <meta content="中国儿童文学网　带你进入文学殿堂！,儿童故事　儿童故事大全　儿童故事集锦　少儿故事大全" name="description"/>
  <meta content="中国儿童文学网,儿童故事　儿童故事大全　儿童故事集锦　少儿故事大全" name="keywords"/>
  <meta content="Copyright 2004 -中国儿童文学网.,Inc" name="copyright"/>
  <meta content="ZERGE" name="author">
   <style type="text/css">
    BODY {FONT-SIZE: 12pt}
TH {FONT-SIZE: 12pt}
TD {FONT-SIZE: 12pt}
   </style>
   <style>
    <!--
A:link {text-decoration: none; color: black}
A:visited {text-decoratio

## Step 2: Find where stories are stored

```html
<p align="left">
    <img height="7" src="/Templets/Images/arrow_01.gif" width="5"/>
        <a href="/news/gushi/22102810472D8G9J0BADGBK60CHE2G2.htm" target="_blank">
            聪明的猴子
        </a>
</p>
```
儿童故事的链接都以“/news/gushi/”开头：
* `suffix = url.strip('/').split('/')[-1]`: 获取后缀("gushi")
* `stories = soup.find_all('a', href=...)`: 找到所有 href 以 "/news/gushi/" 开头的 `<a>` 标签
* `stories[i].get_text().strip()`: 获取stories列表里第i个内容的**文字**，并且去除前后空格

In [3]:
# The last path component of the listing URL names the category folder.
suffix = url.strip('/').rsplit('/', 1)[-1]
link_prefix = f'/news/{suffix}/'


def _is_category_link(href):
    """Accept an anchor only if it has an href that begins with the category prefix."""
    return href and href.startswith(link_prefix)


# Every <a> on the page whose href points inside this category's folder.
stories = soup.find_all('a', href=_is_category_link)
stories

[<a href="/news/gushi/">儿童故事 <i class="arrow-main-nav"></i></a>,
 <a href="/news/gushi/">儿童故事</a>,
 <a href="/news/gushi/221028105247CIKK2J7E56KGCK5AJE25.htm" target="_blank">云端上的恐怖分子</a>,
 <a href="/news/gushi/22102810472D8G9J0BADGBK60CHE2G2.htm" target="_blank">聪明的猴子</a>,
 <a href="/news/gushi/2298171619JE44GG8B97EJ280EB090.htm" target="_blank">月光精灵</a>,
 <a href="/news/gushi/2298171619JE44G0EJG1ADAK7J6AJG.htm" target="_blank">云朵车</a>,
 <a href="/news/gushi/2298171619JE44G1KG271D795J4KJ8.htm" target="_blank">神奇的魔棒</a>,
 <a href="/news/gushi/2298171619JE44GGFH175BI486C6CC.htm" target="_blank">卖梦店</a>,
 <a href="/news/gushi/2298171619JE44GJ229FG060FG6J6E.htm" target="_blank">隐身衣</a>,
 <a href="/news/gushi/2298171619JE44G960D2E5K6F34HF0.htm" target="_blank">小猪波波的蘑菇汤</a>,
 <a href="/news/gushi/2298171619JE44GB0G6C0E1275IDK1.htm" target="_blank">狮子回赠的礼物</a>,
 <a href="/news/gushi/2298171619JE44GG63DGDC8EBJ8014.htm" target="_blank">小袋鼠的袋袋</a>,
 <a href="/news/gushi/2298171619JE44G3DJ7JC8J7

In [4]:
# Show the visible text of one matched link, with surrounding whitespace removed.
sample_link = stories[3]
sample_link.get_text().strip()

'聪明的猴子'

## Step 3: Collect all the story titles and their corresponding URLs
获取标题，网址

In [18]:
BASE_URL = 'http://www.wpwx.cn'
# Safety cap on how many listing pages are followed per category.
MAX_PAGES = 1000


def fetch_category_links(page_url, suffix):
    """Download one listing page and return its category links.

    The page is decoded as GBK and parsed with html.parser. The result holds
    every <a> element whose href starts with ``/news/<suffix>/``, in document
    order.
    """
    response = requests.get(page_url)
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')
    prefix = f'/news/{suffix}/'
    return soup.find_all('a', href=lambda href: href and href.startswith(prefix))


# {'story title': ('url/to/story', 'category')}
titles = {}
# Link texts that are navigation entries rather than stories.
ignore = ['童话故事', '中国童话故事', '儿童故事', '神话故事', '中国神话传说故事']

# The loop variable is deliberately not called `url`, so the module-level
# `url` used by the exploration cells is not overwritten.
for category, category_url in categories.items():
    suffix = category_url.strip('/').split('/')[-1]

    # {'page number': 'url/to/page'} -- pagination links seen so far
    index = {}
    page_number = 1
    next_page = category_url

    # Follow the numbered pagination links until the next number is unknown.
    while next_page is not None and page_number <= MAX_PAGES:
        print('Page:', page_number)

        seen_index = False
        for story in fetch_category_links(next_page, suffix):
            text = story.get_text().strip()
            # Skip navigation entries and anchors without any visible text.
            if not text or text in ignore:
                continue

            link = BASE_URL + story['href']
            if text.isdigit():
                # Purely numeric link text is treated as a pagination link.
                seen_index = True
                index.setdefault(text, link)
                continue

            # The first non-numeric link after the pagination block ends the
            # story list; stop before recording it as a story title.
            if seen_index:
                break

            # Keep the first URL/category seen for a given title.
            titles.setdefault(text, (link, category))

        page_number += 1
        next_page = index.get(str(page_number))

Page: 1
Page: 2
Page: 3
Page: 4
Page: 5
Page: 6
Page: 7
Page: 8
Page: 9
Page: 10
Page: 11
Page: 12
Page: 13
Page: 14
Page: 15
Page: 16
Page: 17
Page: 18
Page: 1
Page: 2
Page: 3
Page: 4
Page: 5
Page: 6
Page: 7
Page: 8
Page: 9
Page: 10
Page: 11
Page: 12
Page: 13
Page: 14
Page: 15
Page: 16
Page: 17
Page: 18
Page: 19
Page: 20
Page: 21
Page: 22
Page: 23
Page: 24
Page: 25
Page: 26
Page: 27
Page: 28
Page: 29
Page: 30
Page: 31
Page: 32
Page: 33
Page: 34
Page: 35
Page: 36
Page: 1
Page: 2
Page: 3
Page: 4
Page: 5


In [21]:
# Print each collected title next to its (story URL, category) record.
for title, record in titles.items():
    print(f"{title}: {record}")

梦幻童话《橙子国王与巧克力屋》: ('http://www.wpwx.cn/news/tonghua/22318144942K96GA9CA24FFD13GG4AJ.htm', '童话故事')
羊妈妈和她的好心邻居们: ('http://www.wpwx.cn/news/tonghua/211118135030JB3JJ12HJJEICE37KDC7.htm', '童话故事')
小鸟开花店: ('http://www.wpwx.cn/news/tonghua/211118122043D8A42D779703BHG85810.htm', '童话故事')
小猫和公鸡: ('http://www.wpwx.cn/news/tonghua/21111812191118EHB25A883DAH26J30J.htm', '童话故事')
小白兔和小青蛙: ('http://www.wpwx.cn/news/tonghua/2111181217327C35E425H16C6510B5JD.htm', '童话故事')
贪吃甜食的小熊: ('http://www.wpwx.cn/news/tonghua/2111181215584DG5AI012J9899I9BEIB.htm', '童话故事')
猫和老鼠: ('http://www.wpwx.cn/news/tonghua/21111812141A800EF3BG8F99H9EI0B9.htm', '童话故事')
狐狸与乌鸦（续写）: ('http://www.wpwx.cn/news/tonghua/211118121216G2HBJDFHC99CGBJFI7HD.htm', '童话故事')
公主与玫瑰: ('http://www.wpwx.cn/news/tonghua/211118121038HFEGJC9G9K9FK2KF9F4E.htm', '童话故事')
聪明的小公鸡: ('http://www.wpwx.cn/news/tonghua/2111181284197DD0JDF442F609G10J5.htm', '童话故事')
蟾蜍美容: ('http://www.wpwx.cn/news/tonghua/2111181271403C6CBK9296D6IAJ44FF.htm', '童话故事')
小黄鱼讲故事（第四部  西

## Step 4: get text of the stories

In [5]:
# Fetch a single known story page so its layout can be examined.
title = '鲤鱼跳龙门'
story_url = titles[title][0]  # entries are (story URL, category)

# Download the page and decode the body as GBK before parsing it.
response = requests.get(story_url)
response.encoding = 'gbk'
soup = BeautifulSoup(markup=response.text, features='html.parser')

NameError: name 'titles' is not defined

In [24]:
## find index of the story title, and index + 1 is the story
lst = soup.find_all('p')

for idx, string in enumerate(lst):
    if title in string.get_text().strip():
        story_idx = idx
        break

lst[story_idx+1].get_text().strip()

'来源：中国民间故事网\u3000\u3000作者：佚名\n\xa0\xa0\xa0 庙峡，又名妙峡。两座巍峨雄奇的凤凰大山，拔水擎天，夹江而立，引人入胜的鲤鱼跳龙门，活灵活现，雄奇壮观。进入峡谷，两山雄峙，悬崖叠垒，峭壁峥嵘，壁峰刺天；奇特的岩花，依壁竞开，把峡谷装缀成仙境一般。这个神奇美妙的峡谷，流传着一个优美动人的故事。\r\n\t\xa0\xa0\xa0 在很早以前，龙溪河畔的乡民，男耕女织，过着安居乐业的美满生活。一年，不知从哪儿飞来一条大黄孽龙，作恶多端。它不是呼风唤雨破坏庄稼，就是吞云吐雾残害生灵，把整个峡谷搞得乌烟瘴气，不得安宁。每年六月六日它的生日这天，更是强迫人们献上一对童男童女和十头大黄牛，一百头猪、羊等物供它享用。如若不然，它就发怒作恶，张开血盆大口，窜上村庄吞噬人畜，破坏田园，害得宁河黎民怨声载道，叫苦连天。\r\n\t\xa0\xa0\xa0 峡口龙溪镇上，有一位聪明俊美的小姑娘，名叫玉姑，她下决心，非除掉这条恶龙不可。有几次，她登上云台观去找云台仙子求救，都未找着。她仍不灰心，继续去找。这天清晨，她登上云台观，仙子被玉姑心诚志坚的精神感动了，就出现在她眼前，向她指点说：“离这儿千里之外有个鲤鱼洞，你可前去会见一位鲤鱼仙子，她定能相助于你。”\r\n\t\xa0\xa0\xa0 玉姑辞别云台仙子，跋山涉水，历尽千辛万苦，来到鲤鱼洞中，找到鲤鱼仙子，说明来意。鲤鱼仙子对玉姑说：“你想为民除害，这是件大好事，可是必须牺牲你自己啊！你能这样做吗？”玉姑毫不犹豫地说：“只要是为乡亲们除害，消灭那恶龙，哪怕是上刀山，下火海，粉身碎骨我也心甘！”鲤鱼仙子见玉姑这样诚恳坚决，十分满意地点了点头，朝玉姑喷了三口白泉，她顿时变成了一条美丽刚劲的红鲤鱼。 \r\n\t\xa0\xa0\xa0 小红鲤逆江而上，经过七七四十九天，游回家乡。这天正是六月六日清晨，她摇身变还原貌，见乡亲们已准备就绪：一对童男童女，十头大黄牛，一百头肥羊肥猪。人们敲锣打鼓，宛如一条长龙向祭黄龙的峡口走来，前面那一对身着红衣红裙的童男童女，早已哭成泪人了。 \r\n\t\xa0\xa0\xa0 黄龙见百姓送到盛餐佳肴，早已垂涎三尺，得意地张开大口。就在这千钓一发之时，玉姑抢先上前，拦住父老乡亲们说道：“大家在此暂停等着，让我前去收拾这个害人精。”话刚说完，只见玉姑纵身跳下水中，霎时变成

In [25]:
def find_story_text(paragraphs, title):
    """Return the text of the paragraph that follows the title paragraph.

    Scans ``paragraphs`` in order for the first element whose stripped text
    contains ``title`` and returns ``get_text()`` of the next element.
    Returns None when no paragraph contains the title, or when the only match
    is the last paragraph (so there is nothing after it).
    """
    for pos, paragraph in enumerate(paragraphs[:-1]):
        if title in paragraph.get_text().strip():
            return paragraphs[pos + 1].get_text()
    return None


# One dict per successfully extracted story; turned into a DataFrame once at
# the end instead of growing the frame row by row.
records = []
nrow = 0  # number of titles processed so far (found or not)
current_category = None

for title, (story_url, category) in titles.items():
    if category != current_category:
        print(f"Category: {category}")
        current_category = category
    if nrow % 20 == 0:
        print(f"Processing story {nrow}...")

    response = requests.get(story_url)
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')

    # The story body is the <p> right after the <p> that contains the title.
    text = find_story_text(soup.find_all('p'), title)
    if text is None:
        print(f"Title not found: {title}, index: {nrow}")
    else:
        records.append({
            'title': title,
            'text': text,
            'category': category,
            'story_url': story_url,
        })
    nrow += 1

df = pd.DataFrame(records, columns=['title', 'text', 'category', 'story_url'])

# Report the rows actually stored, not the number of titles attempted.
print(f"{len(df)} stories saved to dataframe.")


Processing story 0...
Category: 童话故事
Processing story 20...
Processing story 40...
Processing story 60...
Processing story 80...
Processing story 100...
Processing story 120...
Processing story 140...
Processing story 160...
Processing story 180...
Processing story 200...
Processing story 220...
Processing story 240...
Processing story 260...
Processing story 280...
Processing story 300...
Processing story 320...
Processing story 340...
Category: 儿童故事
Processing story 360...
Processing story 380...
Processing story 400...
Processing story 420...
Processing story 440...
Processing story 460...
Processing story 480...
Processing story 500...
Processing story 520...
Processing story 540...
Processing story 560...
Processing story 580...
Processing story 600...
Processing story 620...
Processing story 640...
Processing story 660...
Processing story 680...
Processing story 700...
Processing story 720...
Processing story 740...
Processing story 760...
Processing story 780...
Processing story

In [26]:
# Write the collected stories to a CSV file, omitting the DataFrame index.
output_path = 'stories.csv'
df.to_csv(output_path, index=False)