## 创建爬虫程序

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib import parse
import sqlite3
import re

ignore_words = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])

In [2]:
class crawler:
    def __init__(self, db_name):
        self.db = sqlite3.connect(db_name)

    def __del__(self):
        self.db.close()

    # 只有在commit之后，修改操作才会正式生效
    def db_commit(self):
        self.db.commit()

    # 在table中查找field中为values的项，有则返回rowid，没有则插入新项
    def get_entry_id(self, table, field, values, create=True):
        cur = self.db.execute('''SELECT rowid
                                FROM %s
                                WHERE %s='%s' ''' % (table, field, values))
        res = cur.fetchone()
        if res is None:
            cur = self.db.execute('''INSERT INTO %s(%s) VALUES ('%s')''' % (table, field, values))
            return cur.lastrowid
        else:
            return res[0]

    # 建立网页和单词之间的索引
    def add_to_index(self, url, soup):
        if self.is_indexed(url): return
        print('Indexing %s .' % url)

        text = self.get_text(soup)
        words = self.split_text(text)

        url_id = self.get_entry_id('url_list', 'url', url)

        for i in range(len(words)):
            word = words[i]
            if word in ignore_words: continue
            word_id = self.get_entry_id('word_list', 'word', word)
            # 网页id-单词id-单词在网页中的位置
            self.db.execute(
                '''INSERT INTO word_location(url_id, word_id, location) values (%d, %d, %d)''' % (url_id, word_id, i))

    # 获取soup的纯文本（无html标签）
    def get_text(self, soup):
        # soup只有一个标签时，可以用string获取字符串；有多个标签时返回None
        v = soup.string
        if v is None:
            # soup有多个标签
            c = soup.contents
            result_text = ''
            for t in c:
                # 递归提取纯文本
                sub_text = self.get_text(t)
                result_text += sub_text + '\n'
            return result_text
        else:
            return v.strip()

    # 将纯文本按照非字母划分
    def split_text(self, text):
        splitter = re.compile('\\W*')  # ??? # ps:词干提取：porter stemmer
        return [s.lower() for s in splitter.split(text) if s != '']

    # 链接是否已经被索引，且与url与单词的关系被记录
    def is_indexed(self, url):
        u = self.db.execute('''SELECT rowid
                            FROM url_list
                            WHERE url='%s' ''' % url).fetchone()
        # 检查和单词的关系
        if u is not None:
            v = self.db.execute('''SELECT *
                                FROM word_location
                                WHERE url_id=%d''' % u[0]).fetchone()
            if v is not None: return True
        return False

    # 链接两个url
    def add_link(self, url1, url2, link_text):
        words = self.split_text(link_text)
        id1 = self.get_entry_id('url_list', 'url', url1)
        id2 = self.get_entry_id('url_list', 'url', url2)
        if id1 == id2: return
        cur = self.db.execute('''INSERT INTO link(from_id, to_id) VALUES(%d,%d)''' % (id1, id2))
        link_id = cur.lastrowid
        for word in words:
            if word in ignore_words: continue
            word_id = self.get_entry_id('word_list', 'word', word)
            self.db.execute('''INSERT INTO link_words(link_id, word_id) VALUES(%d, %d)''' % (link_id, word_id))

    # 爬取网页
    def crawl(self, links, depth=2):
        for i in range(depth):
            new_links = set()
            for link in links:
                try:
                    req = requests.get(link)
                except:
                    # get失败
                    print('Could not get %s .' % link)
                    continue

                try:
                    soup = BeautifulSoup(req.text)
                    # 当前链接建立索引
                    self.add_to_index(link, soup)
                    self.db_commit()

                    c_links = soup('a')  # list
                    for c_link in c_links:  # soup
                        if 'href' in c_link.attrs:  # dict
                            url = parse.urljoin(link, c_link['href'])  # 智能连接

                            if url.find("'") != -1:  # 有一些特殊的网址很奇怪有引号
                                continue
                            url = url.split('#')[0]  # 去除位置标记
                            # 加入新链接
                            if url[:4] == 'http' and not self.is_indexed(url):
                                new_links.add(url)
                            # 获取url描述文本，连接两个url
                            c_link_text = self.get_text(c_link)
                            self.add_link(link, url, c_link_text)
                            self.db_commit()
                except:
                    # soup转化失败
                    print('Could not parse page %s .' % link)

            links = new_links

    # 初始化数据库
    def create_index_table(self):
        # 创建5各个表
        self.db.execute('''CREATE TABLE url_list
                            (url TEXT)''')

        self.db.execute('''CREATE TABLE word_list
                            (word TEXT)''')

        self.db.execute('''CREATE TABLE word_location
                            (url_id INTEGER,
                            word_id INTEGER,
                            location INTEGER)''')

        # 链接连接关系表
        self.db.execute('''CREATE TABLE link
                            (from_id INTEGER,
                            to_id INTEGER)''')
        # 链接连接边信息表：链接描述
        self.db.execute('''CREATE TABLE link_words
                            (word_id INTEGER,
                            link_id INTEGER)''')
        # 索引可以加快查询速度，但是会减慢插入速度
        self.db.execute('CREATE INDEX word_idx ON word_list(word)')
        self.db.execute('CREATE INDEX url_idx ON url_list(url)')
        self.db.execute('CREATE INDEX word_url_idx ON word_location(word_id)')
        self.db.execute('CREATE INDEX url_to_idx ON link(to_id)')
        self.db.execute('CREATE INDEX url_from_idx ON link(from_id)')

        self.db_commit()

In [3]:
spi = crawler('searchindex.db')
spi.create_index_table()



In [4]:
lst = ['https://www.baidu.com']
spi.crawl(lst)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Indexing https://www.baidu.com .
Indexing http://ir.baidu.com .
Indexing http://www.baidu.com/duty/ .
Indexing http://tieba.baidu.com .
Indexing http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1 .
Indexing http://jianyi.baidu.com/ .
Indexing http://v.baidu.com .
Indexing http://news.baidu.com .
Indexing http://map.baidu.com .
Indexing https://www.hao123.com .
Indexing http://home.baidu.com .
Indexing https://www.baidu.com/more/ .


In [17]:
# id为1的单词的（url, location）
out = [row for row in spi.db.execute('''SELECT url_id, location
                                        FROM word_location
                                        WHERE word_id=1''')]
print(out)

[(1, 0), (10, 0), (11, 0), (11, 4), (6, 0), (6, 3212), (6, 5971), (7, 0), (7, 109), (7, 111), (12, 0), (12, 8), (12, 47), (12, 49), (5, 0), (5, 1771), (5, 2194), (5, 2196), (2, 0), (2, 2897), (2, 3438), (2, 3468), (2, 3493), (2, 3523), (2, 3548), (2, 3578), (2, 3652), (2, 3682), (4, 0), (4, 420), (4, 434), (4, 478), (4, 492), (3, 0), (3, 260), (3, 22561), (3, 23298), (3, 23310), (3, 24543), (3, 24562), (3, 24580), (3, 24599), (3, 29205), (3, 29406), (3, 29916), (3, 30080), (3, 30243), (3, 30830), (3, 31024), (3, 31181), (3, 31321), (3, 31486), (3, 31637), (3, 32112), (3, 32487), (3, 32566), (3, 32643), (3, 32725), (3, 32857), (3, 32884), (3, 32936), (3, 32963), (3, 32996), (3, 33049), (3, 33103), (3, 33338), (3, 33387), (3, 33431), (3, 35197), (3, 35425), (3, 35556), (3, 35586), (3, 35616), (3, 35650), (3, 35684), (3, 35716), (3, 35749), (3, 35782), (3, 35813), (3, 35844), (3, 35875), (3, 35907), (3, 35938), (3, 35970), (3, 36000), (3, 36032), (3, 36062), (3, 36094), (3, 36124), (3, 36

In [95]:
print(type(z.contents[0]), z.contents)

<class 'bs4.element.NavigableString'> ['æ\x96°é\x97»']
