In [2]:
import aiohttp
import asyncio
import os
import datetime
import json
import pickle
import pandas as pd
import functools

import AiohttpFrame_Config
from AiohttpFrame_Browser import AiohttpFrame_Browser

In [5]:
class Aiohttpframe_FetchAndStore():
    '''
    Base class that fetches URLs with aiohttp, then prepares and stores each response.

    version 0.0

    Subclasses must implement the delegate hooks:

    * ``delegate_prepare_data(response)`` - turn the response text into data
    * ``delegate_store_data(data)`` - persist the prepared data
    * ``prepare_urls(...)`` - build the iterable of URLs to fetch

    Example::

        class FetchAndStore(Aiohttpframe_FetchAndStore):
            def delegate_prepare_data(self, response):
                return json.loads(response)

            def delegate_store_data(self, data):
                with open('out.json', 'w') as fh:
                    json.dump(data, fh)
                return True

            def prepare_urls(self, keywords=None):
                return (f'https://example.com/?q={k}' for k in keywords)

        FAS = FetchAndStore(astime=0.1)
        urls = FAS.prepare_urls(keywords=['a', 'b'])
        results = asyncio.run(FAS.get_data_from_urls(urls))
    '''
    def __init__(self,chrome_path=None,cookies_jsonfile=None,headers=None,astime=0.1):
        '''
        Store the configuration, make sure the cookie file exists and load cookies.

        :param chrome_path: forwarded to AiohttpFrame_Browser; falls back to
            AiohttpFrame_Config.chrome_path when falsy.
        :param cookies_jsonfile: path of the cookie JSON file; falls back to
            AiohttpFrame_Config.cookies_jsonfile when falsy. Created with an
            empty JSON object when it does not exist yet.
        :param headers: request headers; falls back to AiohttpFrame_Config.headers
            when falsy.
        :param astime: pause in seconds used by fetch_url and as the interval
            between requests in get_data_from_urls.
        '''
        self.chrome_path = chrome_path if chrome_path else AiohttpFrame_Config.chrome_path
        self.cookies_jsonfile = cookies_jsonfile if cookies_jsonfile else AiohttpFrame_Config.cookies_jsonfile
        if not os.path.exists(self.cookies_jsonfile):
            # Use a context manager so the handle is flushed and closed right away.
            with open(self.cookies_jsonfile,'w') as cookie_file:
                json.dump({},cookie_file)
        self.headers = headers if headers else AiohttpFrame_Config.headers
        self.astime = astime
        self.Browser = AiohttpFrame_Browser(chrome_path=self.chrome_path,
                                            cookies_jsonfile=self.cookies_jsonfile,
                                            headers=self.headers)
        self.cookies = self.Browser.ab_load_cookies()

    async def fetch_url(self,session, url):
        '''
        GET ``url`` through ``session`` and return the response body as text.

        The coroutine sleeps ``self.astime`` seconds after the response headers
        arrive and before the body is read.
        '''
        async with session.get(url) as response:
            await asyncio.sleep(self.astime)
            return await response.text()

    async def preprare_data(self,response):
        '''
        Pre-process the text returned for a URL via delegate_prepare_data.

        The misspelled method name is kept so existing callers and overrides
        continue to work.
        '''
        return self.delegate_prepare_data(response)
    def delegate_prepare_data(self,response):
        '''Hook: convert the response text into data. Must be overridden.'''
        # NotImplemented is a comparison sentinel, not an exception; raising it
        # produces a confusing TypeError instead of the intended signal.
        raise NotImplementedError('subclasses must implement delegate_prepare_data')

    async def store_data(self,data):
        '''Persist prepared data via delegate_store_data and return its result.'''
        return self.delegate_store_data(data)
    def delegate_store_data(self,data):
        '''Hook: persist the prepared data. Must be overridden.'''
        raise NotImplementedError('subclasses must implement delegate_store_data')

    async def get_data_from_url(self,url):
        '''
        Fetch one URL, prepare the response, store it and return the prepared data.

        A fresh aiohttp.ClientSession carrying self.headers and self.cookies is
        opened for the request.
        '''
        async with aiohttp.ClientSession(headers=self.headers,cookies=self.cookies) as session:
            response = await self.fetch_url(session, url)
            data = await self.preprare_data(response)
            await self.store_data(data)
        return data

    async def get_data_from_urls(self,urls):
        '''
        Run get_data_from_url for every URL and return the set of finished tasks.

        :param urls: iterable of URLs (a generator is fine; it is consumed once).
        :return: set of finished asyncio tasks; call ``.result()`` on each to
            get the prepared data. An empty input yields an empty set.

        When ``self.astime > 0`` requests are throttled: each request is started
        only after the previous one completed and ``self.astime`` seconds have
        passed. Otherwise all requests run concurrently.
        An exception raised by a throttled request propagates to the caller.
        '''
        if self.astime > 0:
            finished = set()
            for index, url in enumerate(urls):
                if index:
                    await asyncio.sleep(self.astime)  # interval between requests
                # Create the task lazily: scheduling all of them up front would
                # start every request at once and defeat the interval.
                task = asyncio.ensure_future(self.get_data_from_url(url))
                await task
                finished.add(task)
            # Return only the tasks created here. asyncio.Task.all_tasks() was
            # removed in Python 3.9 and also included unrelated tasks.
            return finished

        tasks = [asyncio.ensure_future(self.get_data_from_url(url)) for url in urls]
        if not tasks:
            # asyncio.wait raises ValueError on an empty collection.
            return set()
        done, _pending = await asyncio.wait(tasks)
        return done

    def prepare_urls(self):
        '''Hook: build the iterable of URLs to fetch. Must be overridden.'''
        raise NotImplementedError('subclasses must implement prepare_urls')


In [8]:
if __name__ == '__main__':
    class FetchAndStore(Aiohttpframe_FetchAndStore):
        '''Fetch keyword trend data from index.toutiao.com and save each chunk as CSV.'''

        def delegate_prepare_data(self,response):
            '''
            Parse the JSON response text into a DataFrame.

            Reads ``trends_range.start`` / ``trends_range.end`` and the first
            keyword under ``trends``; returns a frame with the columns
            ``date`` (daily range), ``keyword`` and ``data``.
            '''
            payload = json.loads(response)
            trends_range = payload['trends_range']
            dateindex = pd.date_range(start=trends_range['start'],end=trends_range['end'],freq='D')
            keyword = next(iter(payload['trends']))  # first keyword in the response
            return pd.DataFrame({'date':dateindex,'keyword':keyword,'data':payload['trends'][keyword]})

        def delegate_store_data(self,data):
            '''Write the frame to a CSV file named ``<keyword>_<first date>`` and return True.'''
            data.to_csv(f"{data['keyword'][0]}_{data['date'][0].strftime('%Y-%m-%d')}",index=False)
            return True

        def get_date_list(self,sd_date,ed_date,period=7,timefmt=None):
            '''
            Split [sd_date, ed_date] into consecutive chunks of ``period`` days.

            :param sd_date: first day (datetime).
            :param ed_date: last day (datetime), inclusive.
            :param period: chunk length in days.
            :param timefmt: strftime format for the output strings; defaults to
                ``'%Y-%m-%d'`` when None.
            :return: list of ``{'sd_str': ..., 'ed_str': ...}`` dicts; the last
                chunk is clamped to end on ``ed_date``.
            '''
            fmt = timefmt if timefmt is not None else '%Y-%m-%d'
            date_list = []
            count = (ed_date - sd_date).days // period + 1
            for i in range(count):
                chunk_start = sd_date + datetime.timedelta(days=period*i)
                chunk_end = chunk_start + datetime.timedelta(days=period-1)
                date_list.append({'sd_str':chunk_start.strftime(fmt),
                                  'ed_str':chunk_end.strftime(fmt)})
            # Clamp the last chunk to the requested end date, using the same
            # format as every other entry.
            date_list[-1]['ed_str'] = ed_date.strftime(fmt)
            return date_list

        def prepare_urls(self,region='0',category='0',is_hourly='0',start=None,end=None,period=7,keywords=None):
            '''
            Build one trends-API URL per (date chunk, keyword) pair.

            :param start: first day as ``'YYYY-MM-DD'``.
            :param end: last day as ``'YYYY-MM-DD'``.
            :param period: chunk length in days passed to get_date_list.
            :param keywords: iterable of keywords.
            :return: generator of URL strings (dates are sent without dashes).
            '''
            urlformat = 'https://index.toutiao.com/api/keyword/trends?' \
            'region={region}&category={category}&keyword={keyword}&start={start}&end={end}&is_hourly={is_hourly}'
            timefmt = '%Y-%m-%d'
            sd_date = datetime.datetime.strptime(start,timefmt)
            ed_date = datetime.datetime.strptime(end,timefmt)
            date_list = self.get_date_list(sd_date,ed_date,period=period,timefmt=timefmt)
            urls = (urlformat.format(region=region,category=category,is_hourly=is_hourly,
                                     start=d['sd_str'].replace('-',''),
                                     end=d['ed_str'].replace('-',''),
                                     keyword=keyword) for d in date_list for keyword in keywords)
            return urls

if __name__ == '__main__':
    # Demo run: fetch trend data for two keywords in 7-day chunks.
    # NOTE: asyncio.run cannot be called while another event loop is running in
    # the same thread; in such a notebook kernel use
    # `results = await FAS.get_data_from_urls(urls)` instead.
    astime = 0.1
    FAS = FetchAndStore(astime=astime)
    region = '0' # whole country
    category = '0'
    keywords =  ['逍客','指南者']# adjustable
    start = '2019-03-01'# adjustable (original note: ranges over 8 days return no data)
    end = '2019-04-20'# adjustable
    is_hourly = '0'
    # Pass the settings declared above explicitly so that editing them takes effect.
    urls = FAS.prepare_urls(region=region,category=category,is_hourly=is_hourly,
                            start=start,end=end,period=7,keywords=keywords)
    results = asyncio.run(FAS.get_data_from_urls(urls))

In [9]:
# Display the value returned by get_data_from_urls in the run above.
results

{<Task finished coro=<Aiohttpframe_FetchAndStore.get_data_from_url() done, defined at <ipython-input-5-807cd11fba68>:34> result=        date ...  2.809630e+06>,
 <Task finished coro=<Aiohttpframe_FetchAndStore.get_data_from_url() done, defined at <ipython-input-5-807cd11fba68>:34> result=        date ...  45515.250000>,
 <Task finished coro=<Aiohttpframe_FetchAndStore.get_data_from_url() done, defined at <ipython-input-5-807cd11fba68>:34> result=        date ...  6.227760e+05>,
 <Task finished coro=<Aiohttpframe_FetchAndStore.get_data_from_url() done, defined at <ipython-input-5-807cd11fba68>:34> result=        date ...  6.681773e+05>,
 <Task finished coro=<Aiohttpframe_FetchAndStore.get_data_from_url() done, defined at <ipython-input-5-807cd11fba68>:34> result=        date ...  8.187403e+06>,
 <Task finished coro=<Aiohttpframe_FetchAndStore.get_data_from_url() done, defined at <ipython-input-5-807cd11fba68>:34> result=        date ...  8.579035e+06>,
 <Task finished coro=<Aiohttpframe