In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Step 1: Get the HTML content of the website

* `response = requests.get(url)`: 获取网页数据
* `response.encoding = 'gbk'`: 设置编码格式（中文）
* `soup = BeautifulSoup(...)`: 把网页传给Beatiful Soup

* `print(soup.prettify())`: 清理网页并查看格式

In [9]:
url_base = 'http://www.oesz.cn/'
categories = {'睡前故事': url_base + 'shuiqian/',
              '童话故事': url_base + 'tonghua/',
              '寓言故事': url_base + 'yuyan/',
              '成语故事': url_base + 'chengyu/',
              '哲理故事': url_base + 'zheli/',
              '故事大全': url_base + 'gushidaquan/',}

url = categories['睡前故事']
response = requests.get(url)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')

print(soup.prettify())


<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="webkit" name="renderer">
   <meta content="webkit" name="force-rendering"/>
   <meta content="IE=Edge,chrome=1" http-equiv="X-UA-Compatible"/>
   <meta content="width=device-width, initial-scale=1.0, user-scalable=0, minimal-ui" name="viewport"/>
   <meta content="order by www.oesz.cn" name="author"/>
   <meta content="no-transform" http-equiv="Cache-Control">
    <meta content="no-siteapp" http-equiv="Cache-Control"/>
    <meta content="pc,mobile" name="applicable-device"/>
    <meta content="width" name="MobileOptimized"/>
    <meta content="true" name="HandheldFriendly"/>
    <meta content="all" name="Robots"/>
    <title>
     幼儿睡前故事_宝宝睡前小故事_儿童睡前小故事_故事大全网
    </title>
    <meta content="" name="keywords"/>
    <meta content="" name="description"/>
    <link href="/style/css/index.css" rel="stylesheet"/>
    <link href="/style/css/so.css" rel="stylesheet"/>
   </meta>
  </meta>
 </head>
 <body>
  <d

## Step 2.1: Find where stories are stored

```html
 <a href="/shuiqian/11141.html" target="_blank">
    太阳月亮和乌鸦
</a>

<span>
2021-03-31
</span>
```

* `suffix = url.strip('/').split('/')[-1]`: 获取后缀
* `titles = soup.find_all('a', href=...`: 找到所有\<a>并且href以后缀开头的部分
* `titles[i].get_text().strip()`: 获取titles列表里第i个内容的**文字**，并且去除前后空格

In [17]:
suffix = url.strip('/').split('/')[-1]
titles = soup.find_all('a', href=lambda x: x and x.startswith(f'/{suffix}/'))
print(suffix)
titles

shuiqian


[<a href="/shuiqian/">睡前小故事</a>,
 <a href="/shuiqian/" target="_blank">睡前小故事</a>,
 <a href="/shuiqian/11151.html" target="_blank">沙漏的启示_睡前故事</a>,
 <a href="/shuiqian/11150.html" target="_blank">酒爷爷与手套奶奶</a>,
 <a href="/shuiqian/11149.html" target="_blank">南瓜哪儿去了_睡前故事</a>,
 <a href="/shuiqian/11148.html" target="_blank">蜘蛛夫妇的裁缝店</a>,
 <a href="/shuiqian/11147.html" target="_blank">知错能改才能进步</a>,
 <a href="/shuiqian/11146.html" target="_blank">吉米猫分蛋糕_睡前故事</a>,
 <a href="/shuiqian/11145.html" target="_blank">菜园内的故事_睡前故事</a>,
 <a href="/shuiqian/11144.html" target="_blank">会动的房子_睡前故事</a>,
 <a href="/shuiqian/11143.html" target="_blank">狐狸先生的责任</a>,
 <a href="/shuiqian/11142.html" target="_blank">小黑熊打针_睡前故事</a>,
 <a href="/shuiqian/11141.html" target="_blank">太阳月亮和乌鸦</a>,
 <a href="/shuiqian/11140.html" target="_blank">神奇听诊器_睡前故事</a>,
 <a href="/shuiqian/11139.html" target="_blank">割草比赛_睡前故事</a>,
 <a href="/shuiqian/11138.html" target="_blank">同一个朋友_故事网</a>,
 <a href="/shuiqian/11137.html" t

## Step 2.2: Find where indexes are stored
If you didn't see the url to the index (usually number, or "首页", "末页", "下一页") in previous section, locate those indexes. 

```html
<div class="pages">
     <ul>
      <li>
       首页
      </li>
      <li class="thisclass">
       1
      </li>

      ...

       <a href="list_1_2.html">
        下一页
       </a>
      </li>
      <li>
       <a href="list_1_139.html">
        末页
       </a>
      </li>
     </ul>
    </div>
   </div>
  </div>
```


In [18]:
index = soup.find_all('a', href=lambda x: x and x.startswith(f'list_'))
index

[<a href="list_1_2.html">2</a>,
 <a href="list_1_3.html">3</a>,
 <a href="list_1_2.html">下一页</a>,
 <a href="list_1_139.html">末页</a>]

## Step 3: collect story titles in current page
获取本页的所有故事标题和url。

In [48]:
ignore = ['睡前小故事']
category = '睡前故事'
# This store all titles and their links
# format: {'story title': （'url/to/story', 'category') }
titles_dict = {}

for title in titles:
    if title.text in ignore:
        continue
    titles_dict[title.text.strip()] = url_base + title['href'][1:]


titles_dict['太阳月亮和乌鸦']

'http://www.oesz.cn/shuiqian/11141.html'

定位下一页的url，如果不存在，`next_page = None`。

In [49]:
next_page = None
for i in index:
    if i.get_text().strip() == "下一页":
        next_page = i['href']
        break
next_page = url + next_page
next_page

'http://www.oesz.cn/shuiqian/list_1_2.html'

查看前5个爬取的数据：
* `titles_dict.keys()`：故事标题
* `titles_dict.values()`：故事url

In [53]:
for i in range(5):
    title = list(titles_dict.keys())[i]
    url = titles_dict[title]

    print(f"{title}: {url}")

沙漏的启示_睡前故事: http://www.oesz.cn/shuiqian/11151.html
酒爷爷与手套奶奶: http://www.oesz.cn/shuiqian/11150.html
南瓜哪儿去了_睡前故事: http://www.oesz.cn/shuiqian/11149.html
蜘蛛夫妇的裁缝店: http://www.oesz.cn/shuiqian/11148.html
知错能改才能进步: http://www.oesz.cn/shuiqian/11147.html


### 去除重复部分
我们的爬取策略是找到所有以"/shuiqian"开头的\<a>分区。但网页里“最新发布”区的故事一样符合我们的条件会被爬取。这会造成“最新发布”里的故事被反复爬取

我们的方法是将“最新发布”区的故事名存在一个表中，每次爬取时如果故事名重复则不添加。




```html
 <div class="cont">
 <div class="infohd">
 <div class="info_tab"> <span class="on">最新发布</span> <b class="line"></b> </div>
 </div>
 <div class="index_list clearfix">
 <ul class="clearfix">
 <li><span>2021-03-31</span><a href="/shuiqian/11151.html" target="_blank">沙漏的启示_睡前故事</a></li>
 <li><span>2021-03-31</span><a href="/shuiqian/11150.html" target="_blank">酒爷爷与手套奶奶</a></li>
 ...
```

我们可以找到“最新发布”在`soup`(网页代码)里的位置，然后定位所有在它之后的故事标题

In [78]:
recent = soup.find('span', string='最新发布').find_all_next('a', href=lambda x: x and x.startswith(f'/{suffix}/'))

In [79]:
recent_dict = {}
for title in recent:
    if title.text in ignore:
        continue
    titles_dict[title.text.strip()] = url_base + title['href'][1:]
    recent_dict[title.text.strip()] = url_base + title['href'][1:]

## Step 4: collect story titles in every subsequent pages
获取每一页的标题和网址。
这里用“下一页”是否存在作为代码的结束条件

In [None]:
## 当下一页不为空时
while next_page:
    ## 重复Step 1-2
    response = requests.get(next_page)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

In [82]:
## Step 1
response = requests.get(next_page)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')
## Step 2.1
suffix = url.strip('/').split('/')[-1]
titles = soup.find_all('a', href=lambda x: x and x.startswith(f'/{suffix}/'))
## Step 2.2
index = soup.find_all('a', href=lambda x: x and x.startswith(f'list_'))
titles


[]

In [85]:
url

'http://www.oesz.cn/shuiqian/11147.html'

In [None]:
category = '睡前故事'

## Step 1
url = categories[next_page]
response = requests.get(url)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')
## Step 2.1
suffix = url.strip('/').split('/')[-1]
titles = soup.find_all('a', href=lambda x: x and x.startswith(f'/{suffix}/'))
## Step 2.2
index = soup.find_all('a', href=lambda x: x and x.startswith(f'list_'))


ignore = ['睡前小故事']
# This store all titles and their links
# format: {'story title': （'url/to/story', 'category') }
titles_dict = {}

for title in titles:
    if title.text in ignore:
        continue
    titles_dict[title.text.strip()] = url_base + title['href'][1:]

next_page = None
for i in index:
    if i.get_text().strip() == "下一页":
        next_page = i['href']
        break
next_page = url + next_page

