**什么是协程？**

协程是实现并发编程的一种方式。一说并发，你肯定想到了多线程/多进程模型，没错，多线程/多进程，正是解决并发问题的经典模型之一。最初的互联网世界，多线程/多进程在服务器并发中，起到了举足轻重的作用。



In [2]:
import time

def crawl_page(url):
    """Simulate crawling *url* by blocking the calling thread.

    The text after the last underscore in ``url`` is parsed as an integer
    number of seconds to sleep (``'url_3'`` sleeps for 3 seconds). A line is
    printed before and after the pause.

    Raises:
        ValueError: if the text after the last underscore is not an integer.
    """
    print('crawling {}'.format(url))
    # split() always yields at least one piece, so the unpacking cannot fail.
    *_, seconds_text = url.split('_')
    time.sleep(int(seconds_text))
    print('OK {}'.format(url))

def main(urls):
    """Crawl every entry of *urls* one after another, in order."""
    for target in urls:
        # Each call blocks until that page is done before the next one starts.
        crawl_page(target)

%time main(['url_1', 'url_2', 'url_3', 'url_4'])

crawling url_1
OK url_1
crawling url_2
OK url_2
crawling url_3
OK url_3
crawling url_4
OK url_4
CPU times: user 6.1 ms, sys: 4.92 ms, total: 11 ms
Wall time: 10 s


In [7]:
import asyncio
import nest_asyncio

nest_asyncio.apply()
    
async def crawl_page(url):
    """Simulate crawling *url* as a coroutine.

    The text after the last underscore in ``url`` is parsed as an integer
    number of seconds and awaited with ``asyncio.sleep``. A line is printed
    before and after the pause.

    Raises:
        ValueError: if the text after the last underscore is not an integer.
    """
    print('crawling {}'.format(url))
    # rpartition returns the whole string as the tail when there is no '_'.
    delay = int(url.rpartition('_')[2])
    await asyncio.sleep(delay)
    print('OK {}'.format(url))

async def main(urls):
    """Crawl *urls* strictly one at a time.

    Each coroutine is awaited to completion before the next one is created,
    so the pauses add up instead of overlapping.
    """
    for target in urls:
        page_job = crawl_page(target)
        await page_job

%time asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))

crawling url_1
OK url_1
crawling url_2
OK url_2
crawling url_3
OK url_3
crawling url_4
OK url_4
CPU times: user 7.79 ms, sys: 3.76 ms, total: 11.5 ms
Wall time: 10 s


耗时 10 秒是符合预期的。还记得上面所说的吗？await 是同步调用：main 会在 `await crawl_page(url)` 处一直等待，在当前这次调用结束之前，是不会触发下一次调用的。于是，这段代码的效果和上面的同步版本完全一样，相当于我们用异步的接口写了一段同步的代码。

现在又该怎么办呢？

其实很简单，这也正是我接下来要讲的协程中的一个重要概念：任务（Task）。老规矩，先看代码：


In [9]:
import asyncio
import nest_asyncio

nest_asyncio.apply()

async def crawl_page(url):
    """Simulate crawling *url* without blocking the event loop.

    The integer after the last underscore of ``url`` is the number of seconds
    to wait (``'url_2'`` waits 2 seconds). Progress is printed before and
    after the wait.

    Raises:
        ValueError: if the text after the last underscore is not an integer.
    """
    print('crawling {}'.format(url))
    # rsplit with maxsplit=1 still leaves the final segment in the last slot.
    wait_seconds = int(url.rsplit('_', 1)[-1])
    await asyncio.sleep(wait_seconds)
    print('OK {}'.format(url))

async def main(urls):
    """Crawl all *urls* concurrently.

    Every crawl is wrapped in a task up front, so all of them are scheduled
    on the event loop before the first one is awaited; the tasks are then
    awaited in the order of *urls*.
    """
    scheduled = [asyncio.create_task(crawl_page(target)) for target in urls]
    for pending in scheduled:
        await pending

%time asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))

crawling url_1
crawling url_2
crawling url_3
crawling url_4
OK url_1
OK url_2
OK url_3
OK url_4
CPU times: user 7.32 ms, sys: 3.53 ms, total: 10.9 ms
Wall time: 4.01 s


In [12]:
import asyncio
import nest_asyncio

nest_asyncio.apply()

async def crawl_page(url):
    """Coroutine that pretends to download *url*.

    The number of seconds to wait is the integer found after the last
    underscore of ``url``. One line is printed when the crawl starts and one
    when it finishes.

    Raises:
        ValueError: if the text after the last underscore is not an integer.
    """
    print('crawling {}'.format(url))
    pieces = url.split('_')
    pause = int(pieces[-1])
    await asyncio.sleep(pause)
    print('OK {}'.format(url))
    
async def main(urls):
    """Crawl all *urls* concurrently and wait for every crawl to finish.

    One task is created per URL; ``asyncio.gather`` then waits for the whole
    batch in a single await.
    """
    # The generator is fully unpacked (all tasks created) before gather runs.
    await asyncio.gather(
        *(asyncio.create_task(crawl_page(link)) for link in urls)
    )
    
%time asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))

crawling url_1
crawling url_2
crawling url_3
crawling url_4
OK url_1
OK url_2
OK url_3
OK url_4
CPU times: user 6.83 ms, sys: 4.67 ms, total: 11.5 ms
Wall time: 4 s


In [20]:
import asyncio
import nest_asyncio

nest_asyncio.apply()

async def worker_1():
    """Print a start marker, wait one second, then print an end marker."""
    print('worker_1 start')
    pause_seconds = 1
    await asyncio.sleep(pause_seconds)
    print('worker_1 end')

async def worker_2():
    """Print a start marker, wait two seconds, then print an end marker."""
    print('worker_2 start')
    pause_seconds = 2
    await asyncio.sleep(pause_seconds)
    print('worker_2 end')

async def main():
    """Schedule both workers as tasks, then await them one after the other.

    Both tasks are created before anything is awaited, so the workers run
    concurrently; a marker is printed before the first await and after each
    task has been awaited.
    """
    first_job = asyncio.create_task(worker_1())
    second_job = asyncio.create_task(worker_2())

    print('before await')

    await first_job
    print('awaited worker_1')

    await second_job
    print('awaited worker_2')

%time asyncio.run(main())

before await
CPU times: user 905 µs, sys: 312 µs, total: 1.22 ms
Wall time: 1.23 ms
worker_2 start
worker_1 start
worker_1 end
worker_2 end


before await
worker_1 start
worker_2 start
worker_1 end
awaited worker_1
worker_2 end
awaited worker_2

In [23]:
import asyncio

async def worker_1():
    """Wait one second, then return 1."""
    delay = result = 1
    await asyncio.sleep(delay)
    return result

async def worker_2():
    """Wait two seconds, then fail.

    Raises:
        ZeroDivisionError: always, once the wait is over (the return
            expression divides by zero).
    """
    await asyncio.sleep(2)
    numerator, denominator = 2, 0
    return numerator / denominator

async def worker_3():
    """Wait three seconds, then return 3."""
    delay = result = 3
    await asyncio.sleep(delay)
    return result

async def main():
    """Run the three workers concurrently, cancel the last one, print outcomes.

    All three workers are scheduled as tasks. After two seconds the third
    task is cancelled, and the tasks are gathered with
    ``return_exceptions=True`` so that exceptions come back as entries of the
    result list instead of propagating. The list is printed.
    """
    first, second, third = (
        asyncio.create_task(start())
        for start in (worker_1, worker_2, worker_3)
    )

    await asyncio.sleep(2)
    third.cancel()

    outcomes = await asyncio.gather(
        first, second, third, return_exceptions=True
    )
    print(outcomes)

%time asyncio.run(main())

[1, ZeroDivisionError('division by zero'), CancelledError('')]
CPU times: user 2.99 ms, sys: 3.05 ms, total: 6.03 ms
Wall time: 2.01 s


In [25]:
import asyncio
import random

async def consumer(queue, id):
    """Endlessly take values off *queue*, reporting each one.

    Args:
        queue: object whose awaitable ``get()`` yields the next value.
        id: label printed in front of every reported value.

    The loop has no exit condition of its own; after each value it pauses
    for one second before asking for the next.
    """
    while True:
        item = await queue.get()
        report = '{} get a value: {}'.format(id, item)
        print(report)
        await asyncio.sleep(1)

async def producer(queue, id):
    """Put five random integers from 1 to 10 onto *queue*, one per second.

    Args:
        queue: object whose awaitable ``put(value)`` accepts the next value.
        id: label printed in front of every reported value.
    """
    for _ in range(5):
        number = random.randint(1, 10)
        await queue.put(number)
        report = '{} put a value: {}'.format(id, number)
        print(report)
        await asyncio.sleep(1)

async def main():
    """Run two consumers and two producers against one shared queue.

    The consumers loop forever, so after ten seconds they are cancelled
    explicitly; all four tasks are then gathered with
    ``return_exceptions=True`` so the cancellations do not propagate.
    """
    queue = asyncio.Queue()

    # Consumers are scheduled before producers, matching the creation order
    # used for the final gather.
    consumers = [
        asyncio.create_task(consumer(queue, label))
        for label in ('consumer_1', 'consumer_2')
    ]
    producers = [
        asyncio.create_task(producer(queue, label))
        for label in ('producer_1', 'producer_2')
    ]

    await asyncio.sleep(10)
    for running in consumers:
        running.cancel()

    await asyncio.gather(*consumers, *producers, return_exceptions=True)
    
%time asyncio.run(main())    

producer_1 put a value: 9
producer_2 put a value: 4
consumer_1 get a value: 9
consumer_2 get a value: 4
producer_1 put a value: 2
producer_2 put a value: 7
consumer_1 get a value: 2
consumer_2 get a value: 7
producer_1 put a value: 9
producer_2 put a value: 4
consumer_1 get a value: 9
consumer_2 get a value: 4
producer_1 put a value: 10
producer_2 put a value: 3
consumer_1 get a value: 10
consumer_2 get a value: 3
producer_1 put a value: 7
producer_2 put a value: 10
consumer_1 get a value: 7
consumer_2 get a value: 10
CPU times: user 12.8 ms, sys: 6.31 ms, total: 19.1 ms
Wall time: 10 s


In [16]:
import requests
from bs4 import BeautifulSoup

# HTTP request headers passed to requests.get() in main() below:
# a desktop Chrome User-Agent string, a Referer, and a keep-alive connection.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
    'Referer':'https://time.geekbang.org/column/article/101855',
    'Connection':'keep-alive'
}


def main():
    """Fetch Douban's Shanghai "now playing" page and print the upcoming movies.

    For every ``li.list-item`` inside the ``div#upcoming`` section, one line
    is printed with the movie title, its release date and its detail-page
    link.

    Raises:
        requests.RequestException: on connection problems, on timeout, or
            when the server answers with an HTTP error status.
        RuntimeError: if the page contains no ``div#upcoming`` section.
    """
    url = "https://movie.douban.com/cinema/nowplaying/shanghai"
    # requests applies no timeout by default, so a stalled server would block
    # this call forever; bound the wait explicitly.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail here on 4xx/5xx instead of parsing an error page as if it were
    # the movie listing.
    response.raise_for_status()
    init_soup = BeautifulSoup(response.content, 'lxml')

    all_movies = init_soup.find('div', id="upcoming")
    if all_movies is None:
        # find() returns None when nothing matches; report that clearly
        # rather than crashing on None.find_all below.
        raise RuntimeError(
            'no div with id "upcoming" found in {}'.format(url)
        )
    for each_movie in all_movies.find_all('li', class_="list-item"):
        movie_name = each_movie['data-title']
        movie_date = each_movie.find('li', class_='release-date').string.strip()
        movie_src = each_movie.find('li', class_='stitle').a['href']
        print('{} {} {}'.format(movie_name, movie_date, movie_src))

%time main()

我要我们在一起 05月20日上映 https://movie.douban.com/subject/25881778/?from=playing_poster
唐顿庄园2 05月20日上映 https://movie.douban.com/subject/35008440/?from=playing_poster
爱犬奇缘 05月20日上映 https://movie.douban.com/subject/35440759/?from=playing_poster
异兽 05月20日上映 https://movie.douban.com/subject/35817963/?from=playing_poster
黎乡遇见你 05月20日上映 https://movie.douban.com/subject/35859238/?from=playing_poster
牧民省长尕布龙 05月20日上映 https://movie.douban.com/subject/35630352/?from=playing_poster
青春是首歌 05月27日上映 https://movie.douban.com/subject/35295545/?from=playing_poster
盒子的秘密 05月27日上映 https://movie.douban.com/subject/35561858/?from=playing_poster
刿心剑 05月28日上映 https://movie.douban.com/subject/35352814/?from=playing_poster
魔法鼠乐园 06月01日上映 https://movie.douban.com/subject/30487738/?from=playing_poster
CPU times: user 267 ms, sys: 79.4 ms, total: 346 ms
Wall time: 2.16 s


In [15]:
import asyncio
import aiohttp

from bs4 import BeautifulSoup

# HTTP request headers handed to aiohttp.ClientSession in fetch_content() below:
# a desktop Chrome User-Agent string, a Referer, and a keep-alive connection.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
    'Referer':'https://time.geekbang.org/column/article/101855',
    'Connection':'keep-alive'
}

url = "https://movie.douban.com/cinema/nowplaying/shanghai"

async def fetch_content(url):
    """Download *url* with aiohttp and return the response body as text.

    A new client session (using the module-level ``headers``) is opened for
    this single request and closed again before returning.
    """
    # NOTE(review): ssl=False appears to turn off TLS certificate
    # verification for this connector — confirm that this is intended.
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(headers=headers, connector=connector) as session:
        async with session.get(url) as response:
            body = await response.text()
            return body

async def main():
    init_page = await fetch_content(url) 
    init_soup = BeautifulSoup(init_page, 'lxml')
    
