## 非协程实现

In [5]:
import time

def crawl_page(url):
    """Simulate crawling ``url`` by blocking for a number of seconds encoded in it.

    The text after the last ``_`` in ``url`` is parsed as an integer and used
    as the sleep duration (``'url_3'`` blocks for 3 seconds). A progress line
    is printed before and after the wait.

    Raises:
        ValueError: if the text after the last ``_`` is not an integer.
    """
    print(f"crawling:{url}")
    # The delay is whatever follows the final underscore of the URL.
    delay_seconds = int(url.rpartition('_')[2])
    time.sleep(delay_seconds)
    print(f"OK：{url}")


def main(urls):
    """Crawl every entry of ``urls`` in order, one at a time."""
    # Each call blocks until its page is finished before the next one starts.
    for page_url in urls:
        crawl_page(page_url)

In [6]:
%time main(['url_1', 'url_2', 'url_3', 'url_4'])

crawling:url_1
OK：url_1
crawling:url_2
OK：url_2
crawling:url_3
OK：url_3
crawling:url_4
OK：url_4
CPU times: user 3.96 ms, sys: 1.93 ms, total: 5.89 ms
Wall time: 10 s


## 协程实现(同步协程)

In [22]:
# import asyncio

# asyncio.get_event_loop()

!pip install nest_asyncio
import nest_asyncio
nest_asyncio.apply()



In [27]:
import asyncio

async def crawl_page(url):
    """Coroutine that simulates crawling ``url`` with a non-blocking sleep.

    The integer after the last ``_`` in ``url`` is the number of seconds to
    wait. A progress line is printed before and after the wait.

    Raises:
        ValueError: if the text after the last ``_`` is not an integer.
    """
    print(f"crawling:{url}")
    # Only the trailing piece of the underscore-separated URL matters.
    *_, seconds_text = url.split('_')
    await asyncio.sleep(int(seconds_text))
    print(f"OK：{url}")
    
async def main(urls):
    """Await ``crawl_page`` for each entry of ``urls``, strictly one after another."""
    # Awaiting the coroutine directly finishes one page before the next starts,
    # so the crawls do not overlap.
    for page_url in urls:
        await crawl_page(page_url)

In [28]:
%time asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))
# %time await main(['url_1', 'url_2', 'url_3', 'url_4'])

crawling:url_1
OK：url_1
crawling:url_2
OK：url_2
crawling:url_3
OK：url_3
crawling:url_4
OK：url_4
CPU times: user 5.84 ms, sys: 2.59 ms, total: 8.43 ms
Wall time: 10 s


## 异步协程

In [31]:
import asyncio

async def crawl_page(url):
    """Coroutine that simulates crawling ``url`` with a non-blocking sleep.

    The integer after the last ``_`` in ``url`` is the number of seconds to
    wait. A progress line is printed before and after the wait.

    Raises:
        ValueError: if the text after the last ``_`` is not an integer.
    """
    print(f"crawling:{url}")
    sleep_time = int(url.split('_')[-1])
    await asyncio.sleep(sleep_time)
    print(f"OK：{url}")


# Backward-compatible alias: this cell originally defined the coroutine under
# the misspelled name ``crwal_page``.
crwal_page = crawl_page


async def main(urls):
    """Crawl all ``urls`` concurrently by wrapping each crawl in a task.

    Every task is created before the first ``await``, so all crawls are
    already scheduled when ``main`` starts waiting on them.
    """
    # create_task is what turns each coroutine into a scheduled task.
    tasks = [asyncio.create_task(crawl_page(url)) for url in urls]
    for task in tasks:
        await task

In [32]:
%time asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))

crawling:url_1
crawling:url_2
crawling:url_3
crawling:url_4
OK：url_1
OK：url_2
OK：url_3
OK：url_4
CPU times: user 7.64 ms, sys: 3.12 ms, total: 10.8 ms
Wall time: 4 s


In [34]:
import asyncio

async def crawl_page(url):
    """Coroutine that simulates crawling ``url`` with a non-blocking sleep.

    The integer after the last ``_`` in ``url`` is the number of seconds to
    wait. A progress line is printed before and after the wait.

    Raises:
        ValueError: if the text after the last ``_`` is not an integer.
    """
    print(f"crawling:{url}")
    sleep_time = int(url.split('_')[-1])
    await asyncio.sleep(sleep_time)
    print(f"OK：{url}")


# Backward-compatible alias: this cell originally defined the coroutine under
# the misspelled name ``crwal_page``.
crwal_page = crawl_page


async def main(urls):
    """Crawl all ``urls`` concurrently and wait for the whole batch with ``gather``."""
    # create_task is what turns each coroutine into a scheduled task.
    tasks = [asyncio.create_task(crawl_page(url)) for url in urls]
    # gather waits for every task in a single await instead of looping over them.
    await asyncio.gather(*tasks)

In [35]:
%time asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))

crawling:url_1
crawling:url_2
crawling:url_3
crawling:url_4
OK：url_1
OK：url_2
OK：url_3
OK：url_4
CPU times: user 7.23 ms, sys: 2.83 ms, total: 10.1 ms
Wall time: 4 s


## 协程运行时

In [38]:
import asyncio

async def worker_1():
    """Print a start marker, wait one second without blocking, print a done marker."""
    print("worker_1 start")
    pause_seconds = 1
    await asyncio.sleep(pause_seconds)
    print("worker_1 done")
    
async def worker_2():
    """Print a start marker, wait two seconds without blocking, print a done marker."""
    print("worker_2 start")
    pause_seconds = 2
    await asyncio.sleep(pause_seconds)
    print("worker_2 done")
    
async def main():
    """Run ``worker_1`` and then ``worker_2`` back to back, logging after each one."""
    print("before await")
    # Awaiting a bare coroutine runs it to completion before the next
    # statement executes, so the two workers never overlap here.
    steps = (
        (worker_1, "awaited worker_1"),
        (worker_2, "awaited worker_2"),
    )
    for worker, message in steps:
        await worker()
        print(message)

In [39]:
%time asyncio.run(main())

before await
worker_1 start
worker_1 done
awaited worker_1
worker_2 start
worker_2 done
awaited worker_2
CPU times: user 3.81 ms, sys: 2.13 ms, total: 5.94 ms
Wall time: 3 s


In [42]:
import asyncio

async def worker_1():
    """One-second worker: logs when it starts and when it finishes."""
    print("worker_1 start")
    # Non-blocking wait; other tasks may run while this one is suspended.
    delay = 1
    await asyncio.sleep(delay)
    print("worker_1 done")
    
async def worker_2():
    """Two-second worker: logs when it starts and when it finishes."""
    print("worker_2 start")
    # Non-blocking wait; other tasks may run while this one is suspended.
    delay = 2
    await asyncio.sleep(delay)
    print("worker_2 done")
    
async def main():
    """Schedule both workers as tasks up front, then await them in turn."""
    # The tasks are created here but only begin running once main reaches
    # its first await and yields control to the event loop.
    first_task = asyncio.create_task(worker_1())
    second_task = asyncio.create_task(worker_2())
    print("before await")
    awaited = (
        (first_task, "awaited worker_1"),
        (second_task, "awaited worker_2"),
    )
    for pending, message in awaited:
        await pending
        print(message)

In [43]:
%time asyncio.run(main())

before await
worker_1 start
worker_2 start
worker_1 done
awaited worker_1
worker_2 done
awaited worker_2
CPU times: user 3.98 ms, sys: 1.78 ms, total: 5.76 ms
Wall time: 2 s


- 给某些协程任务限定运行时间，一旦超时就取消

In [47]:
import asyncio

async def worker_1():
    """Wait one second without blocking, then return ``1``."""
    outcome = 1
    await asyncio.sleep(1)
    return outcome

async def worker_2():
    """Wait two seconds without blocking, then fail.

    Raises:
        ZeroDivisionError: always, once the sleep has finished.
    """
    await asyncio.sleep(2)
    numerator, denominator = 2, 0
    # Dividing by zero here always raises after the wait.
    return numerator / denominator

async def worker_3():
    """Wait three seconds without blocking, then return ``3``."""
    outcome = 3
    await asyncio.sleep(3)
    return outcome

async def main():
    """Start three workers, cancel the third after two seconds, print all outcomes.

    ``gather`` is called with ``return_exceptions=True``, so exceptions and the
    cancellation show up as entries of the printed list instead of propagating.
    """
    jobs = [
        asyncio.create_task(worker())
        for worker in (worker_1, worker_2, worker_3)
    ]

    # Give the workers two seconds, then cancel the last one (worker_3).
    await asyncio.sleep(2)
    jobs[-1].cancel()

    res = await asyncio.gather(*jobs, return_exceptions=True)
    print(res)

In [48]:
%time asyncio.run(main())

[1, ZeroDivisionError('division by zero'), CancelledError()]
CPU times: user 1.89 ms, sys: 1.28 ms, total: 3.18 ms
Wall time: 2 s


## 协程实现生产者消费者模型

In [49]:
import asyncio
import random


async def consumer(queue, id):
    """Take values off ``queue`` forever, printing each one tagged with ``id``.

    The loop has no exit condition of its own; it runs until the surrounding
    task is cancelled.
    """
    while True:
        # Suspends here while the queue is empty.
        item = await queue.get()
        print(f"{id} get a value: {item}")
        # Pause one second between items.
        await asyncio.sleep(1)
        
async def producer(queue, id):
    """Put five random integers from 1 to 10 on ``queue``, one per second.

    Each value is printed, tagged with ``id``, right after it is enqueued.
    """
    rounds = 5
    for _ in range(rounds):
        number = random.randint(1, 10)
        await queue.put(number)
        print(f"{id} put a value: {number}")
        # Pause one second before producing the next value.
        await asyncio.sleep(1)
        
async def main():
    """Run two consumers and two producers on one queue for ten seconds.

    The consumers loop forever, so they are cancelled after the ten-second
    window. ``gather`` then collects all four tasks with
    ``return_exceptions=True`` so the cancellations do not propagate.
    """
    queue = asyncio.Queue()

    # Creation order is kept: both consumers first, then both producers.
    consumers = [
        asyncio.create_task(consumer(queue, label))
        for label in ('consumer1', 'consumer2')
    ]
    producers = [
        asyncio.create_task(producer(queue, label))
        for label in ('producer1', 'producer2')
    ]

    await asyncio.sleep(10)
    for running in consumers:
        running.cancel()

    await asyncio.gather(*consumers, *producers, return_exceptions=True)

%time asyncio.run(main())


producer1 put a value: 6
producer2 put a value: 1
consumer1 get a value: 6
consumer2 get a value: 1
producer1 put a value: 5
producer2 put a value: 10
consumer1 get a value: 5
consumer2 get a value: 10
producer1 put a value: 4
producer2 put a value: 2
consumer1 get a value: 4
consumer2 get a value: 2
producer1 put a value: 1
producer2 put a value: 6
consumer1 get a value: 1
consumer2 get a value: 6
producer1 put a value: 8
producer2 put a value: 3
consumer1 get a value: 8
consumer2 get a value: 3
CPU times: user 8.75 ms, sys: 2.68 ms, total: 11.4 ms
Wall time: 10 s


## 豆瓣近日推荐电影爬虫

### 同步版本

In [76]:
import requests
from bs4 import BeautifulSoup


head = {'User-Agent': 'test'}

def main():
    """Print the title and release date of each upcoming movie listed on Douban (Beijing).

    The listing page is downloaded and parsed, then every movie's detail page
    is downloaded and parsed in turn, one blocking request at a time. One line
    per movie is printed as ``"<name> <date>"``.

    Both kinds of request send the module-level ``head`` headers and use a
    timeout, so a stalled connection raises instead of hanging the notebook.
    If the ``showing-soon`` container is missing from the listing page, a
    notice is printed and the function returns without crawling anything.

    Raises:
        requests.exceptions.RequestException: on network errors or timeouts.
    """
    url = "https://movie.douban.com/cinema/later/beijing/"
    request_timeout = 10  # seconds; requests waits indefinitely without one

    init_page = requests.get(url, headers=head, timeout=request_timeout).content
    init_soup = BeautifulSoup(init_page, 'lxml')

    all_movies = init_soup.find('div', id="showing-soon")
    if all_movies is None:
        # The listing markup is missing (layout change or a blocked request).
        print(f"no 'showing-soon' section found at {url}")
        return

    for each_movie in all_movies.find_all('div', class_="item"):
        all_a_tag = each_movie.find_all('a')
        all_li_tag = each_movie.find_all('li')

        # The second <a> carries the title and detail link; the first <li> the date.
        movie_name = all_a_tag[1].text
        url_to_fetch = all_a_tag[1]['href']
        movie_date = all_li_tag[0].text

        # Send the same headers as the listing request (the original omitted them here).
        resp_item = requests.get(
            url_to_fetch, headers=head, timeout=request_timeout
        ).content
        soup_item = BeautifulSoup(resp_item, 'lxml')
        # NOTE(review): the poster <img> is looked up but not printed; presumably
        # kept so the per-movie work matches the coroutine version — confirm.
        img_tag = soup_item.find("img")

        print(f"{movie_name} {movie_date}")

In [77]:
%time main()

我的非凡父母 09月02日
世间有她 09月09日
海的尽头是草原 09月09日
断网 09月09日
还是觉得你最好 09月09日
重回地球 09月09日
我要和你在一起 09月09日
狼群 09月09日
请别相信她 09月09日
妈妈！ 09月10日
青蛙王国之极限运动 09月10日
热汤 09月16日
追梦少年 09月16日
撼沙 10月01日
红孩儿之初生牛犊 10月01日
我是霸王龙 10月01日
小美人鱼之大海怪传说 10月01日
新灰姑娘2 10月01日
绑架游戏 12月23日
龙马精神 12月31日
透明侠侣 12月31日
绝望主夫 12月31日
CPU times: user 368 ms, sys: 24 ms, total: 392 ms
Wall time: 5.44 s


### 协程版本

In [58]:
# !pip install aiohttp

In [61]:

import asyncio
import aiohttp

from bs4 import BeautifulSoup

header={"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36"}

def _new_session():
    """Build a ClientSession that sends ``header`` and uses the notebook's TLS setting."""
    # NOTE(review): ssl=False turns off TLS certificate verification. It is
    # kept from the original; drop it unless the environment really needs it.
    return aiohttp.ClientSession(
        headers=header, connector=aiohttp.TCPConnector(ssl=False)
    )


async def fetch_content(url, session=None):
    """Download ``url`` and return the response body as text.

    Args:
        url: address to fetch.
        session: optional open ``aiohttp.ClientSession`` to reuse. When it is
            omitted, a temporary session is created and closed for this single
            request, which is the original behaviour.

    Returns:
        The decoded response body.
    """
    if session is not None:
        async with session.get(url) as response:
            return await response.text()

    async with _new_session() as own_session:
        async with own_session.get(url) as response:
            return await response.text()


async def main():
    """Print title, release date and poster URL of each upcoming movie on Douban (Beijing).

    The listing page is fetched first, then all detail pages are fetched
    concurrently with ``asyncio.gather``. One ``ClientSession`` is shared by
    every request instead of opening a new session per URL. One line per
    movie is printed as ``"<name> <date> <poster url>"``; the poster URL is
    left empty when the detail page has no ``<img>`` tag or the tag has no
    ``src``.
    """
    url = "https://movie.douban.com/cinema/later/beijing/"

    async with _new_session() as session:
        init_page = await fetch_content(url, session)
        init_soup = BeautifulSoup(init_page, 'lxml')

        all_movies = init_soup.find('div', id="showing-soon")
        if all_movies is None:
            # The listing markup is missing (layout change or a blocked request).
            print(f"no 'showing-soon' section found at {url}")
            return

        movie_names, urls_to_fetch, movie_dates = [], [], []
        for each_movie in all_movies.find_all('div', class_="item"):
            all_a_tag = each_movie.find_all('a')
            all_li_tag = each_movie.find_all('li')

            # The second <a> carries the title and detail link; the first <li> the date.
            movie_names.append(all_a_tag[1].text)
            urls_to_fetch.append(all_a_tag[1]['href'])
            movie_dates.append(all_li_tag[0].text)

        # gather returns results in argument order, so pages line up with the lists above.
        tasks = [fetch_content(movie_url, session) for movie_url in urls_to_fetch]
        pages = await asyncio.gather(*tasks)

    for movie_name, movie_date, page in zip(movie_names, movie_dates, pages):
        soup_item = BeautifulSoup(page, 'lxml')
        img_tag = soup_item.find('img')
        # A page without an <img> would otherwise fail on img_tag['src'].
        poster_url = img_tag.get('src', '') if img_tag is not None else ''

        print('{} {} {}'.format(movie_name, movie_date, poster_url))

%time asyncio.run(main())

我的非凡父母 09月02日 https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2877643228.jpg
世间有她 09月09日 https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2878770125.jpg
海的尽头是草原 09月09日 https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2877827228.jpg
断网 09月09日 https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2872591745.jpg
还是觉得你最好 09月09日 https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2878576476.jpg
重回地球 09月09日 https://img2.doubanio.com/view/photo/s_ratio_poster/public/p2872616271.jpg
我要和你在一起 09月09日 https://img2.doubanio.com/view/photo/s_ratio_poster/public/p2879022921.jpg
狼群 09月09日 https://img2.doubanio.com/view/photo/s_ratio_poster/public/p2878713351.jpg
请别相信她 09月09日 https://img2.doubanio.com/view/photo/s_ratio_poster/public/p2876821751.jpg
妈妈！ 09月10日 https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2878837838.jpg
青蛙王国之极限运动 09月10日 https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2876826798.jpg
热汤 09月16日 https://img2.doubanio