# 协程

## 爬虫

In [1]:
import time

def crawl_page(url):
    """Simulate crawling ``url`` by blocking the current thread.

    The number of seconds to block is the integer that follows the last
    underscore in ``url`` (``'url_3'`` blocks for 3 seconds). A line is
    printed before and after the wait.
    """
    print(f'crawling {url}')
    delay_seconds = int(url.rsplit('_', 1)[-1])
    time.sleep(delay_seconds)
    print(f'OK {url}')

def main(urls):
    """Crawl every entry of ``urls`` in order, one page at a time."""
    for page_url in urls:
        # Blocking call: the next page starts only after this one finishes.
        crawl_page(page_url)

%time main(['url_1', 'url_2', 'url_3', 'url_4'])

crawling url_1
OK url_1
crawling url_2
OK url_2
crawling url_3
OK url_3
crawling url_4
OK url_4
CPU times: user 5.15 ms, sys: 3.98 ms, total: 9.13 ms
Wall time: 10 s


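并发优化：先用 async/await 改写（逐个 await 仍然是串行执行，耗时 10 s），再用 `asyncio.create_task` / `asyncio.gather` 让各个任务真正并发执行（耗时 4 s）。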

In [5]:
%pip install nest_asyncio

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Note: you may need to restart the kernel to use updated packages.


In [9]:
import asyncio
import nest_asyncio
nest_asyncio.apply()

async def crawl_page(url):
    """Simulate crawling ``url`` without blocking the event loop.

    The wait, in seconds, is the integer after the last underscore in
    ``url``. A line is printed before and after the wait.
    """
    print(f'crawling {url}')
    delay_seconds = int(url.rsplit('_', 1)[-1])
    await asyncio.sleep(delay_seconds)
    print(f'OK {url}')

async def main(urls):
    """Crawl ``urls`` one after another.

    Each coroutine is awaited to completion before the next one is
    created, so the pages are processed serially.
    """
    for page_url in urls:
        await crawl_page(page_url)

%time asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))

crawling url_1
OK url_1
crawling url_2
OK url_2
crawling url_3
OK url_3
crawling url_4
OK url_4
CPU times: user 6.38 ms, sys: 5.01 ms, total: 11.4 ms
Wall time: 10 s


In [11]:
import asyncio

async def crawl_page(url):
    """Pretend to download ``url``, yielding to the event loop while waiting.

    The trailing ``_<n>`` part of ``url`` gives the wait in seconds; a
    status line is printed at the start and at the end.
    """
    print(f'crawling {url}')
    wait_seconds = int(url.rsplit('_', 1)[-1])
    await asyncio.sleep(wait_seconds)
    print(f'OK {url}')

async def main(urls):
    """Schedule one task per URL up front, then wait for each task in turn.

    All tasks are created before the first ``await``, so they run
    concurrently while this coroutine waits on them in order.
    """
    scheduled = []
    for page_url in urls:
        scheduled.append(asyncio.create_task(crawl_page(page_url)))
    for job in scheduled:
        await job

%time asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))



crawling url_1
crawling url_2
crawling url_3
crawling url_4
OK url_1
OK url_2
OK url_3
OK url_4
CPU times: user 7.39 ms, sys: 5.14 ms, total: 12.5 ms
Wall time: 4 s


In [12]:
import asyncio

async def crawl_page(url):
    """Simulate fetching ``url`` with a non-blocking sleep.

    The sleep length in seconds is parsed from the text after the last
    underscore of ``url``. Prints a line before and after sleeping.
    """
    print(f'crawling {url}')
    pause = int(url.rsplit('_', 1)[-1])
    await asyncio.sleep(pause)
    print(f'OK {url}')

async def main(urls):
    """Create one task per URL and wait for all of them with ``gather``."""
    await asyncio.gather(
        *(asyncio.create_task(crawl_page(page_url)) for page_url in urls)
    )

%time asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))


crawling url_1
crawling url_2
crawling url_3
crawling url_4
OK url_1
OK url_2
OK url_3
OK url_4
CPU times: user 7.99 ms, sys: 6.03 ms, total: 14 ms
Wall time: 4 s


In [14]:
import asyncio

async def worker_1():
    """Wait one second, then return 1."""
    result = 1
    await asyncio.sleep(1)
    return result

async def worker_2():
    """Wait two seconds, then fail with ``ZeroDivisionError``.

    The division by zero is deliberate: this worker exists to produce an
    exception for the caller to collect.
    """
    await asyncio.sleep(2)
    numerator, denominator = 2, 0
    return numerator / denominator

async def worker_3():
    """Wait three seconds, then return 3."""
    result = 3
    await asyncio.sleep(3)
    return result

async def main():
    """Run the three workers concurrently and print their collected outcomes.

    ``worker_3`` is cancelled two seconds in. ``gather`` is called with
    ``return_exceptions=True`` so that exceptions (including the
    cancellation) are returned in the result list instead of being raised.
    """
    task_1, task_2, task_3 = (
        asyncio.create_task(job())
        for job in (worker_1, worker_2, worker_3)
    )

    # Let the workers run for two seconds before cancelling the third one.
    await asyncio.sleep(2)
    task_3.cancel()

    outcomes = await asyncio.gather(
        task_1, task_2, task_3, return_exceptions=True
    )
    print(outcomes)

%time asyncio.run(main())


[1, ZeroDivisionError('division by zero'), CancelledError('')]
CPU times: user 1.75 ms, sys: 2.03 ms, total: 3.79 ms
Wall time: 2 s


## 生产者消费者

In [15]:
import random
import asyncio

async def consumer(queue, id):
    """Take messages off ``queue`` forever, pausing one second after each.

    The loop never exits on its own; it runs until the task is cancelled.
    ``id`` is only used to label the printed line.
    """
    # NOTE: the "comsumer" spelling in the message is kept exactly as in the
    # recorded output of this cell.
    while True:
        message = await queue.get()
        print(f"comsumer {id} consume msg {message}")
        await asyncio.sleep(1)
        

async def producer(queue, id):
    """Put ten random integers (1-10 inclusive) on ``queue``, one per second.

    Each value is printed right after it has been queued. ``id`` is only
    used to label the printed line.
    """
    for _ in range(10):
        payload = random.randint(1, 10)
        await queue.put(payload)
        print(f"producer {id} produce msg {payload}")
        await asyncio.sleep(1)

async def main():
    """Run two consumers and two producers over one shared queue.

    The consumers loop forever, so they are cancelled after ten seconds.
    All four tasks are then gathered with ``return_exceptions=True`` so
    the cancellations are collected rather than raised.
    """
    queue = asyncio.Queue()

    consumers = [
        asyncio.create_task(consumer(queue, name))
        for name in ('consumer_1', 'consumer_2')
    ]
    producers = [
        asyncio.create_task(producer(queue, name))
        for name in ('producer_1', 'producer_2')
    ]

    # NOTE(review): the fixed 10 s wait lines up with the producers' ten
    # one-second rounds; if that pacing changes, consumers may be cancelled
    # while messages are still queued -- confirm this is acceptable.
    await asyncio.sleep(10)
    for running_consumer in consumers:
        running_consumer.cancel()

    await asyncio.gather(*consumers, *producers, return_exceptions=True)

%time asyncio.run(main())

producer producer_1 produce msg 7
producer producer_2 produce msg 5
comsumer consumer_1 consume msg 7
comsumer consumer_2 consume msg 5
producer producer_1 produce msg 10
producer producer_2 produce msg 2
comsumer consumer_1 consume msg 10
comsumer consumer_2 consume msg 2
producer producer_1 produce msg 1
producer producer_2 produce msg 9
comsumer consumer_1 consume msg 1
comsumer consumer_2 consume msg 9
producer producer_1 produce msg 5
producer producer_2 produce msg 1
comsumer consumer_1 consume msg 5
comsumer consumer_2 consume msg 1
producer producer_1 produce msg 5
producer producer_2 produce msg 4
comsumer consumer_1 consume msg 5
comsumer consumer_2 consume msg 4
producer producer_1 produce msg 1
producer producer_2 produce msg 6
comsumer consumer_1 consume msg 1
comsumer consumer_2 consume msg 6
producer producer_1 produce msg 1
producer producer_2 produce msg 6
comsumer consumer_1 consume msg 1
comsumer consumer_2 consume msg 6
producer producer_1 produce msg 8
producer pro

## 实战：豆瓣近日推荐电影爬虫

In [19]:
%pip install lxml

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting lxml
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/94/6a/42141e4d373903bfea6f8e94b2f554d05506dfda522ada5343c651410dc8/lxml-5.3.0-cp313-cp313-macosx_10_13_universal2.whl (8.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lxml
Successfully installed lxml-5.3.0
Note: you may need to restart the kernel to use updated packages.


In [15]:
import requests
from bs4 import BeautifulSoup

def main():
    """Print the title and release date of every movie in Douban's upcoming list.

    Downloads the Beijing "later" cinema page, looks up the
    ``showing-soon`` container and prints one line per ``item`` entry in
    the form ``<title> <date> 1``. Prints ``all movies is none`` when the
    container is not present in the page.

    Exceptions raised by ``requests.get`` (connection errors, timeouts)
    propagate to the caller.
    """
    url = "https://movie.douban.com/cinema/later/beijing/"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/521.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    # requests applies no timeout by default; without one a stalled
    # connection would hang the cell indefinitely.
    init_page = requests.get(url, headers=headers, timeout=10).content
    init_soup = BeautifulSoup(init_page, 'lxml')

    all_movies = init_soup.find('div', id="showing-soon")
    if not all_movies:
        print("all movies is none")
        return

    for each_movie in all_movies.find_all('div', class_="item"):
        all_a_tag = each_movie.find_all('a')
        all_li_tag = each_movie.find_all('li')

        # The title is read from the second link and the date from the first
        # list entry; skip entries lacking either instead of raising
        # IndexError on unexpected markup.
        if len(all_a_tag) < 2 or not all_li_tag:
            continue

        movie_name = all_a_tag[1].text
        movie_date = all_li_tag[0].text

        # Fetching each movie's detail page is not implemented; the trailing
        # 1 is printed unchanged (presumably a placeholder for the poster
        # image -- TODO confirm).
        print('{} {} {}'.format(movie_name, movie_date, 1))

%time main()

去唱卡拉OK吧！ 01月17日 1
狗的审判 01月17日 1
莫莉的冒险 01月18日 1
真爱找麻烦！ 01月18日 1
笑傲江湖 01月28日 1
射雕英雄传：侠之大者 01月29日 1
封神第二部：战火西岐 01月29日 1
哪吒之魔童闹海 01月29日 1
蛟龙行动 01月29日 1
唐探1900 01月29日 1
熊出没·重启未来 01月29日 1
祭屋 01月30日 1
美国队长4 02月14日 1
我们的命中注定 02月14日 1
真爱营业 02月14日 1
多幸运遇见你 02月14日 1
花样年华 02月 1
7天 03月14日 1
午夜怨灵 03月14日 1
苍茫的天涯是我的爱 05月01日 1
CPU times: user 35.4 ms, sys: 4.62 ms, total: 40 ms
Wall time: 909 ms


# Futures

# Asyncio