# Snippets

This notebook collects utility functions I find useful when working in Python, covering web crawling, quantitative trading, and database construction.

## Reduce Memory Usage

Downcast the columns of a DataFrame to reduce its memory usage.

In [1]:
import numpy as np


def reduce_mem_usage(df):
    """Downcast the columns of a DataFrame in place to reduce memory usage.

    Integer columns (signed and unsigned) are cast to the narrowest integer
    type of the same signedness whose range contains the column's min and max.
    Float columns are cast to float16 or float32 when the value range fits,
    otherwise float64; note that this check is on range only, so narrowing a
    float column can lose precision. Object columns become ``category``.
    Every other dtype (bool, datetime, categorical, extension dtypes, ...) is
    left untouched.

    Args:
        df: the pandas DataFrame to shrink; it is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    bytes_per_mb = 1024 ** 2
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    # memory_usage() reports bytes, so convert before labelling the value MB.
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object:
            df[col] = df[col].astype('category')
            continue
        # Only plain numpy integer/float columns have a range we can compare
        # against iinfo/finfo; anything else is skipped instead of crashing.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'f':
            # All-NaN or infinite columns fail every comparison and stay float64.
            target = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved))
    return df

## Cache Wrapper

Cache intermediate results on disk for later reuse.

In [4]:
import hashlib
from diskcache import Cache
from functools import wraps


def cache_wrapper(directory: str = './cache/', expire: int = 3600):
    """Build a decorator that memoizes a function's results in a disk cache.

    The cache key is the function name plus an MD5 digest of the function
    name and the ``str()`` form of the positional and keyword arguments, so
    arguments must have a stable, distinguishing string representation.

    Args:
        directory: directory passed to ``diskcache.Cache``.
        expire: value passed as ``expire`` when a result is stored.

    Returns:
        A decorator that wraps a function with the caching behaviour.
    """
    cache = Cache(directory=directory)
    # Private sentinel so that a cached ``None`` result is told apart from a
    # cache miss; comparing against None would recompute such results forever.
    missing = object()

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            signature = func.__name__ + str(args) + str(kwargs)
            digest = hashlib.md5(signature.encode('utf-8')).hexdigest()
            key = func.__name__ + ':' + digest
            result = cache.get(key, default=missing)
            if result is not missing:
                return result
            result = func(*args, **kwargs)
            cache.set(key=key, value=result, expire=expire)
            return result
        return wrapper
    return decorator

## Fetch Proxy

Fetch some free proxies for your crawler.

In [None]:
import time
import requests
import pandas as pd


def get_proxy(page_size: int = 20):
    """Scrape free proxies from kuaidaili and keep the ones that work.

    Each listing page is read with ``pd.read_html``; every proxy found is then
    probed with a GET request to https://www.baidu.com using a 1 second timeout.

    Args:
        page_size: number of listing pages to scrape, starting from page 1.

    Returns:
        A list of single-entry dicts ``{scheme: 'scheme://ip:port'}`` for the
        proxies that answered the probe; empty when none were found or usable.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    }
    url_list = [f'https://free.kuaidaili.com/free/inha/{i}/' for i in range(1, page_size + 1)]
    proxies = []
    for url in url_list:
        data = pd.read_html(url)[0][['IP', 'PORT', '类型']].drop_duplicates()
        print(f'[+] {url} Get Success!')
        data['类型'] = data['类型'].str.lower()
        addresses = (data['类型'] + '://' + data['IP'] + ':' + data['PORT'].astype('str')).to_list()
        proxies += [{address.split('://')[0]: address} for address in addresses]
        # Pause between listing pages.
        time.sleep(0.8)

    available_proxies = []
    for proxy in proxies:
        try:
            res = requests.get('https://www.baidu.com',
                headers=headers, proxies=proxy, timeout=1)
            res.raise_for_status()
            available_proxies.append(proxy)
        except Exception as e:
            # Best effort: a proxy that fails for any reason is just skipped.
            print(str(e))

    # Guard the rate computation: no scraped proxies must not divide by zero.
    rate = len(available_proxies) / len(proxies) * 100 if proxies else 0.0
    print(f'[=] Get {len(proxies)} proxies, while {len(available_proxies)} are available. '
        f'Current available rate is {rate:.2f}%')
    return available_proxies

## Chinese Holidays

Fetch Chinese public holidays, which can be used to construct a `pd.DateOffset` that follows the trading-day calendar of the Chinese market.

In [None]:
import requests
import pandas as pd


def chinese_holidays():
    """Download the holiday dates published by api.apihubs.cn.

    Pages of up to 366 entries are requested until the ``page``, ``size`` and
    ``total`` fields of the response say everything has been read.

    Returns:
        A list of pandas timestamps, one per holiday returned by the API.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    root = 'https://api.apihubs.cn/holiday/get'
    page = 1
    holidays = []
    while True:
        params = f'?field=date&holiday_recess=1&cn=1&page={page}&size=366'
        response = requests.get(root + params)
        response.raise_for_status()
        data = response.json()['data']
        # An empty page has no `date` column to read; treat it as the end.
        if not data['list']:
            break
        days = pd.DataFrame(data['list'])['date'].astype('str')\
            .astype('datetime64[ns]').to_list()
        holidays += days
        if data['page'] * data['size'] >= data['total']:
            break
        page += 1
    return holidays

## Proxy Request

A request helper that fetches a URL through proxies, trying them one after another until a request succeeds or the retry limit is reached.

In [1]:
import time
import random
import requests

def proxy_request(
    url: str,
    proxies: 'dict | list',
    retry: int = None,
    timeout: int = 1,
    delay: int = 0,
    verbose: bool = True,
    **kwargs
):
    """GET ``url`` through randomly ordered proxies until one attempt succeeds.

    Args:
        url: address to request.
        proxies: one ``requests``-style proxy dict, or a list of them. The
            caller's list is never modified.
        retry: maximum number of attempts; defaults to the number of proxies.
            When it exceeds the number of proxies, they are reused in order.
        timeout: timeout in seconds passed to ``requests.get``.
        delay: seconds to sleep after each failed attempt.
        verbose: print one line per attempt when true.
        **kwargs: forwarded to ``requests.get``.

    Returns:
        The first response with a successful status, or ``None`` when every
        attempt failed or no proxy was supplied.
    """
    # Work on a private copy so shuffling does not reorder the caller's list.
    pool = [proxies] if isinstance(proxies, dict) else list(proxies)
    if not pool:
        return None
    retry = retry or len(pool)
    random.shuffle(pool)
    for attempt in range(1, retry + 1):
        proxy = pool[(attempt - 1) % len(pool)]
        try:
            response = requests.get(url, proxies=proxy, timeout=timeout, **kwargs)
            response.raise_for_status()
        except Exception as e:
            # Best effort: any failure just moves on to the next attempt.
            if verbose:
                print(f'[-] [{e}] {url}, try {attempt}/{retry}')
            time.sleep(delay)
        else:
            if verbose:
                print(f'[+] {url}, try {attempt}/{retry}')
            return response
    return None

## Chinese Stock Code Wrapper

This function wraps a bare stock code with its market identifier.

In [None]:
import re

def wrap_code(
    code: str,
    formatstr: str = '{code}.{market}',
    style: str = 'wind',
):
    """Attach a market identifier to a bare six-digit stock code.

    Args:
        code: the numeric stock code, e.g. ``'600000'``.
        formatstr: template with ``{code}`` and ``{market}`` placeholders.
        style: naming convention of the market suffix: ``'wind'`` gives
            SH/SZ/BJ, ``'rq'`` and ``'jq'`` give XSHG/XSHE and an empty
            suffix for BJ-pattern codes.

    Returns:
        The formatted code, or ``None`` when the code matches no known market
        pattern.

    Raises:
        ValueError: if ``code`` is not purely numeric or ``style`` is unknown.
    """
    if not code.isdigit():
        raise ValueError('It seems your code has already been wrapped')

    if style == 'wind':
        sh_market, sz_market, bj_market = 'SH', 'SZ', 'BJ'
    elif style in ('rq', 'jq'):
        # NOTE(review): no BJ suffix is defined for rq/jq here, so such codes
        # are formatted with an empty market; confirm this is intended.
        sh_market, sz_market, bj_market = 'XSHG', 'XSHE', ''
    else:
        raise ValueError('Style must be one of `wind`, `rq` or `jq`')

    market_patterns = (
        (r'[69]\d{5}', sh_market),
        (r'[023]\d{5}', sz_market),
        (r'[84]\d{5}', bj_market),
    )
    for pattern, market in market_patterns:
        # fullmatch rejects codes longer than six digits.
        if re.fullmatch(pattern, code):
            return formatstr.format(code=code, market=market)
    return None

## StockUS Crawler

A crawler designed to get data from [stockus](https://stock.us). It can fetch index prices and stock prices, list reports, and search for reports.

In [38]:
import datetime


class StockUS:
    '''Client for the api.stock.us endpoints used below
    ---------------------------------------
    All methods are classmethods; no instance is required.
    '''

    __root = "https://api.stock.us/api/v1/"
    headers = {
        "Host": "api.stock.us",
        "Origin": "https://stock.us",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6 Safari/605.1.15",
        "Accept-Language": "zh-CN",
    }
    # Report category id -> display name.
    category = {
        1: "宏观经济",
        2: "投资策略",
        3: "行业研究",
        4: "晨会早报",
        8: "金工量化",
        9: "债券研究",
        10: "期货研究",
    }
    # Evaluated once when the class is created, so it goes stale in a
    # long-running session. Kept for backward compatibility; the methods use
    # _today() instead.
    todaystr = datetime.datetime.today().strftime(r'%Y%m%d')

    @staticmethod
    def _today():
        '''Return the current date as a YYYYMMDD string.'''
        return datetime.datetime.today().strftime(r'%Y%m%d')

    @staticmethod
    def _format_authors(authors):
        '''Join author names with spaces, appending "*" to authors whose
        "prize" field is truthy; anything that is not a list gives "".'''
        if not isinstance(authors, list):
            return ''
        return ' '.join(
            author['name'] + ('*' if author['prize'] else '') for author in authors)

    @classmethod
    def _price(cls, endpoint: str, code: str, start: str = None, end: str = None):
        '''Fetch a price table from `endpoint` and index it by date.'''
        params = {
            'security_code': code,
            'start': start or '19900101',
            'stop': end or cls._today(),
        }
        # Passing params lets requests URL-encode the values.
        res = requests.get(cls.__root + endpoint, params=params, headers=cls.headers).json()
        price = pd.DataFrame(res['price'])
        price['date'] = price['date'].astype('datetime64[ns]')
        return price.set_index('date')

    @classmethod
    def index_price(
        cls,
        index: str,
        start: str = None,
        end: str = None,
    ):
        '''Get the price history of an index
        ---------------------------------------
        index: str, security code of the index
        start: str, start date as YYYYMMDD, default '19900101'
        end: str, end date as YYYYMMDD, default today

        return: DataFrame of the "price" records, indexed by date
        '''
        return cls._price('index-price', index, start, end)

    @classmethod
    def cn_price(
        cls,
        code: str,
        start: str = None,
        end: str = None,
    ):
        '''Get the price history of a stock from the cn-price endpoint
        ---------------------------------------
        code: str, security code of the stock
        start: str, start date as YYYYMMDD, default '19900101'
        end: str, end date as YYYYMMDD, default today

        return: DataFrame of the "price" records, indexed by date
        '''
        return cls._price('cn-price', code, start, end)

    @classmethod
    def report_list(
        cls,
        category: str = 8,
        sub_category: str = 0,
        keyword: str = '',
        period: str = 'all',
        org_name: str = '',
        author: str = '',
        xcf_years: str = '',
        search_fields: str = 'title',
        page: int = 1,
        page_size: int = 100
    ):
        '''Get the report list of a category
        ---------------------------------------
        category: category id, use StockUS.category to see possible choices
        sub_category: sub category id, only sent when category is not 8
        keyword: str, key word to search, default empty string
        period: str, report during this time period
        org_name: str, search by org_name
        author: str, search by author
        xcf_years: str, search by xcf_years
        search_fields: str, search in fields, support "title", "content", "content_fp"
        page: int, page number
        page_size: int, page size

        return: DataFrame indexed by report id; empty when nothing matched
        '''
        params = {
            'category': category,
            'dates': period,
            'q': keyword,
            'org_name': org_name,
            'author': author,
            'xcf_years': xcf_years,
            'search_fields': search_fields,
            'page': page,
            'page_size': page_size,
        }
        if category != 8:
            params['sub_category'] = sub_category
        headers = {
            "Referer": "https://stock.us/cn/report/quant",
        }
        headers.update(cls.headers)
        res = requests.get(
            cls.__root + 'research/report-list', params=params, headers=headers).json()
        data = pd.DataFrame(res['data'])
        # An empty result has none of the columns used below.
        if data.empty:
            return data
        data[['pub_date', 'pub_week']] = data[['pub_date', 'pub_week']].astype('datetime64[ns]')
        data.authors = data.authors.map(cls._format_authors)
        data = data.set_index('id')
        return data

    @classmethod
    def report_search(
        cls,
        keyword: str = '',
        period: str = '3m',
        org_name: str = '',
        author_name: str = '',
        xcf_years: str = '',
        search_fields: str = 'title',
        page: int = 1,
        page_size: int = 100
    ):
        '''Search report in stockus database
        ---------------------------------------
        keyword: str, key word to search, default empty string
        period: str, report during this time period
        org_name: str, search by org_name
        author_name: str, search by author name
        xcf_years: str, search by xcf_years
        search_fields: str, search in fields, support "title", "content", "content_fp"
        page: int, page number
        page_size: int, page size

        return: DataFrame indexed by report id; empty when nothing matched
        '''
        params = {
            'dates': period,
            'q': keyword,
            'org_name': org_name,
            'author_name': author_name,
            'xcf_years': xcf_years,
            'search_fields': search_fields,
            'page': page,
            'page_size': page_size,
        }
        res = requests.get(
            cls.__root + 'research/report-search', params=params, headers=cls.headers).json()
        data = pd.DataFrame(res['data'])
        # An empty result has none of the columns used below.
        if data.empty:
            return data
        data['pub_date'] = data['pub_date'].astype('datetime64[ns]')
        data.authors = data.authors.map(cls._format_authors)
        data = data.set_index('id')
        return data

## Cnki crawler

This is a CNKI crawler; for now it only supports simple keyword search.

In [22]:
import re
import requests
from math import ceil


class Cnki:
    """Minimal CNKI search client; posts the same form as the site's result grid."""

    __search_url = "https://kns.cnki.net/KNS8/Brief/GetGridTableHtml"

    @classmethod
    def generic_search(cls, keyword: str, page: int = 3):
        """Search CNKI by subject keyword and collect the result tables.

        Args:
            keyword: subject keyword to search for.
            page: number of result pages (50 records each) to fetch; -1 means
                every page. It is capped at the number of pages reported by
                the first response.

        Returns:
            A DataFrame with the rows of all fetched pages stacked together,
            with the first column of the result table dropped.

        Raises:
            IndexError: if the result count cannot be found in the first
                response.
        """
        import json

        per_page = 50
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'kns.cnki.net',
            'Origin': 'https://kns.cnki.net',
            'Referer': 'https://kns.cnki.net/kns8/defaultresult/index',
        }
        # Escape the keyword as a JSON string body so quotes or backslashes in
        # it cannot break the QueryJson document it is spliced into.
        escaped_keyword = json.dumps(f'{keyword}', ensure_ascii=False)[1:-1]
        data = {
            "IsSearch": 'true',
            "QueryJson": '{"Platform":"","DBCode":"SCDB","KuaKuCode":"CJFQ,CDMD,CIPD,CCND,BDZK,CISD,SNAD,CCJD,GXDB_SECTION,CJFN,CCVD,CLKLK","QNode":{"QGroup":[{"Key":"Subject","Title":"","Logic":1,"Items":[{"Title":"主题","Name":"SU","Value":"' + escaped_keyword + '","Operate":"%=","BlurType":""}],"ChildItems":[]}]}}',
            "PageName": 'DefaultResult',
            "DBCode": 'SCDB',
            "KuaKuCodes": 'CJFQ,CDMD,CIPD,CCND,BDZK,CISD,SNAD,CCJD,GXDB_SECTION,CJFN,CCVD,CLKLK',
            # Opaque token sent verbatim. It is written as adjacent string
            # literals (joined at compile time) so it can span several
            # source lines without embedding newlines in the value.
            "SearchSql": (
                "0645419CC2F0B23BC604FFC82ADF67C6E920108EDAD48468E8156BA693E89F481391D6F5096D7FFF3585B29E8209A884EFDF8EF1B43B4C7232E120D4832CCC896D30C069E762ACAB990E5EBAAD03C09721B4573440249365A4157D3C93DC874963F6078A465F9A4E6BEED14E5FD119B250F0488206491CF1C7F670020480B48EE2FF3341B3B9C8A0A38F9913EF596174EDD44BBA8277DA2BE793C92DF83782297DE55F70BBF92D5397159D64D1D3DAC96FAD28213BD3E1912A5B4A4AD58E5965CBDBA01069691140F14FD0298FBD1F452C7779EFF17124633292E356C88367122976245AA928FA07D061C0E091BB1136031750CD76D7D64E9D75B7FBAB11CAA5B80183AC60BB0885D2C0A0938C7D1F849656014326473DCB797D5D273C845DAF7FCE49D21478E9B06B77ADE6253ACD4FE1D87EE31B4B2C94E071EE733B3A64EA6EE9CD5F222FCD3DA1D83D9133EF8C9BED9ED3E55DA15F3B4A37C85463B60D2F0BEA46FC7135898D7D93F63AF8B2246716E32B699238901588EE5D1DEF30A01DCE9957CF6934E8B11E273747F9A9BB8ADF535E5E76F6A9386CFBE605748C132DA05E2D31832199B0A4ECF170ACA47154423CF6BBD9607FC505765E95637F93DC865AA738F5EE92B26DB9AF56509A5FC96FF9C3A1720633EBDDC62EC2162E7D5349CAC851ED0AD4E36DCF6FE25EBEAB42BF931DBE3CF4ED1A7BB8FD887C3C33D86B768B0BA7267C4E0E7DEE53D0931F71F07AE13BAFC46034A444EC24C7EA8F0086FAD197A8D2F18C6CBC5DF48050AF8D4C84DE03B9A6F1DF928D63286B1C924B7EC3BA8C2591D60491F95D271F0E7F02AA2AA93C3888B8CCEBB0414BD7145AD15A3166DB4860F85BC476B1B193C219EAE52E33E6BBC9B3AAAD97196977B7DABA36C04093ED723AD874EC6480477C6412B0F589DE6CC7D959855E41265213DCBB4D91238716DF38BF78C951259572F8E5968FAC5C5CDC006DBE919EEB5E5518F51162FCE7CDE520F60093D333FBE121D3164C6D2451F6431FB7973C659E6A9D287B545EC044DE2CBE170F3627719F8418D44E17987CEC7A89B52CB5525AF795DA892475ABF871C3A5A5FCBC5B03EB9BEC8598C8ADD7A68984BBBEF1244DD90386C05756687AB9D87A0B521319C093C3EC0D5EBEFDAB5459E29F1DA03D4C25DE740BF9FA2BC07DD510386E3BBE89F10D45513E29C8CF904763E723CE4BF2928D4DC2A731DD53595E9AACED90679FCDDACED022ECD59D72600A736D555A8B76BFE4CCD861E6A7F5A219EBE9A228BD008928299DB999D18F9CDD2E57E8C03EDF236E62EDB17A1FE5B023CF6E5A11892A5FA17EE5CFE348CA290DC691987A535223133D8CA101E8ABF13EFCAD929635E090B3C6BB6838E33B7C78C1DBA2"
                "74101A6584300EF8D38C983AD544264217F6793562D19715CD711295C5410C72E88A64BD23D9049E5DF15EA6B3EB4473C1DDEBB416459322FEF0CC61D894476DCD62569527BE23FB7F66DF3F5182ABF2472FB60039CA77218F356D7F82E4EBAAA4C6875B5BD4729C81A29BDF55ED223AA0DAB04E1B248524FC504711360C330186327A780D6487BA831ABE55AAE38E69A0FBEF89D560E7AA26B991966E4B644338863E80AD9D1ACAD459EA933644C5A0D2EA44AD17205AED3BE66AEC01F48BA032EEBD620E2713082FE8D31E4A05A34F18BD389587FA4D3A9DFBB8C16AEE9C5FA9E667BA12A07B757D82F7BB41AC8867D9947CCBA3BB26381EC6D0D3966338DB6FA3D1A61F99A978C3B5ED2B31B7C14D54A4F688C4925C8AF99CB3EE3C2C06C7D35AD891BF0CFC820529FD990F2FF319BE195B1AD23C1667031C072EB1964F8512BB779125E46773C01714FCF0E339AEB0C44FB91B896A7A95AF4F81EB49006B570BC03ECA7D8DA45679F3B46A7AE3B46ED8D319CED49A3A5881A37CD3770703BDF026ACEF7D8662F85AFDBDD36C540FD419E18F30EA0483D24350B7C34C43F3D0065F339EAC15749DF8849F3880378FEA4AD7CCBAA827C828A5CAF7D56E97A87A3FAEEAE136B35FB37E8CE0233D9AF8DEABD47BD5B36A1B42B995D4F96FE744A2E25E9B6107801CACCA0DDC2B7ED5BFD39F68AB2E2BB66AB8286061049F3B5FFE871FFA520A7C0EEE3DEDF417D078DF9013B5F5251A07AE3D4D00B9AF1560200CC981D0E8BE17C9CE204C21E5E543C9E55421D4FCE2C309C68D376E3787AB4640FA99B82988A288FD22A2E0C9225E39A5DAA7EBEB0376912C9CA255A7AE49F3C5AB262B4FFFBA98A9548623C16D0C97C7315DF5FFD1507102EAA730E5247F1C492D49A45121347CFF39A5181729F1D33F28FA48035CBC02CF87DAF72067D70B524421AB21FF137A2C7AB2F90DAD1BA1786C16728E7B78DB0461B5B1E8CF7B88E765E67AF4E458EF3A5125D90DA88CE97D9AB9C4363E4A7D6B7F3B0420B93FEDF72248E076EC0871EDFC5744AC6F9F591CEC4CE3E0E681E1C1B21AFCC5BF5B22116F7E7A3ABA561F68F8AE685DA926756CD70C0E6057C7737537F972F8942CCFD073400F0D5C23F107F55FC07745ED334FB97130860A0B7B0B5B4B2B23417EA63C65BAF1624254BBA167373F1D6C0E0BB5A67F92008CFCA4F24276E725FD05802F94A5CC7E52CC005017C58A8757BDEDED54538DA513E975DFCDC7D3FA95552E960ABA05EB7C33CA37CCA1C93DFF13A493174A9BB3228118E0F2AEBBAEE074D557B6FA6000F0E5C73D563BB8E3598B4D8E94DDCAFEB5BBCDF74D39CCC8AD27A5D3C0CAB59DA24BEB86C10F8584878FA94BE9F1F9D2FA01023A5B838BDCD18C58E4F08C0BF1C31ED2"
                "5B32438C95D613B5227B0C63CE5B090A49B23416A06BCB9365406EE953CB1245CA00A7791C1F10267F95FD6A5B93F78DBDA6C96F036928F943A8CED955AEF96C63CF849B30EFD0B94BC88E124F1CE2B186D0120F40"
            ),
            "CurPage": '1',
            "RecordsCntPerPage": str(per_page),
            "CurDisplayMode": 'listmode',
            "CurrSortField": r'%e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2f(%e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2c%27TIME%27)',
            "CurrSortFieldType": 'desc',
            "IsSentenceSearch": 'false',
            "Subject": '',
        }

        # The first request yields both page 1 and the total record count.
        req = requests.post(cls.__search_url, headers=headers, data=data)
        text = req.text
        total = int(re.findall(r'共找到.{0,}?([\d,]+).{0,}?条结果', text)[0].replace(',', ''))
        last_page = ceil(total / per_page)
        page = last_page if page == -1 else min(page, last_page)
        print(f'[+] Current page 1 / {page}')
        # Page 1 is always kept, so a single-page request returns it instead
        # of concatenating an empty list.
        results = [pd.read_html(text)[0]]

        # Remaining pages reuse the same form with an updated page number.
        for p in range(2, page + 1):
            data.update(CurPage=f'{p}', IsSearch='false')
            req = requests.post(cls.__search_url, headers=headers, data=data)
            text = req.text
            print(f'[+] Current page {p} / {page}')
            results.append(pd.read_html(text)[0])
        results = pd.concat(results, axis=0)
        results = results.drop(results.columns[0], axis=1)
        return results

## Weibo Searcher

This is a Weibo searcher that simulates the search bar on the Weibo website.

In [11]:
import re
import time
import random
import requests
import pandas as pd
from tqdm import tqdm
from urllib.parse import quote


class WeiboSearch:
    '''A search crawler engine for weibo
    ====================================
    sample usage:
    >>> result = WeiboSearch.search("keyword")
    '''

    __base = "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D{}&page_type=searchall&page={}"

    @staticmethod
    def _search_headers(keyword: str):
        '''Build the request headers shared by every search-page request.'''
        # NOTE: the second positional argument of quote() is `safe`, not the
        # encoding; the call is kept as-is so the Referer stays unchanged.
        return {
            "Referer": f"https://m.weibo.cn/search?containerid=100103type%3D1%26q%3D{quote(keyword, 'utf-8')}",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"
        }

    @classmethod
    def _get_content(cls, url, headers):
        '''Fetch one result page and return its blogs

        Returns a list of blog dicts, or False when the response carries a
        "msg" field, which this crawler treats as the end of the results.
        '''

        def _parse(mblog):
            blog = {
                "created_at": mblog["created_at"],
                # Strip HTML tags from the post text.
                "text": re.sub(r'<(.*?)>', '', mblog['text']),
                "id": mblog["id"],
                "link": f"https://m.weibo.cn/detail/{mblog['id']}",
                "source": mblog["source"],
                "username": mblog["user"]["screen_name"],
                "reposts_count": mblog["reposts_count"],
                "comments_count": mblog["comments_count"],
                "attitudes_count": mblog["attitudes_count"],
                "isLongText": mblog["isLongText"],
            }
            if blog["isLongText"]:
                # Long posts need a second request for the full text.
                detail_headers = {
                    "Referer": f"https://m.weibo.cn/detail/{blog['id']}",
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.2 Safari/605.1.15"
                }
                resp = requests.get(f"https://m.weibo.cn/statuses/extend?id={blog['id']}", headers=detail_headers).json()
                blog["full_text"] = resp["data"]["longTextContent"]
            return blog

        res = requests.get(url, headers=headers).json()
        if res.get("msg"):
            return False

        blogs = []
        for card in res["data"]["cards"]:
            # A card either holds one 'mblog' or a group of cards that may.
            mblog = card.get("mblog")
            card_group = card.get("card_group")
            if mblog:
                blogs.append(_parse(mblog))
            elif card_group:
                for cg in card_group:
                    mblog = cg.get("mblog")
                    if mblog:
                        blogs.append(_parse(mblog))
        return blogs

    @classmethod
    def _get_full(cls, keyword: str):
        '''Fetch pages until one comes back empty or marks the end.'''
        page = 1
        result = []
        headers = cls._search_headers(keyword)
        print(f"Start in keyword: {keyword}")
        while True:
            print(f"Getting {keyword}, currently at page: {page} ... ")
            url = cls.__base.format(keyword, page)
            blogs = cls._get_content(url, headers)
            if not blogs:
                break
            result.extend(blogs)
            page += 1
            # Random pause between page requests.
            time.sleep(random.randint(5, 8))
        print(f"Finished in keyword: {keyword}!")
        return result

    @classmethod
    def _get_assigned(cls, keyword: str, pages: int):
        '''Fetch at most `pages` pages, stopping early when results end.'''
        result = []
        print(f"Start in keyword: {keyword}")
        headers = cls._search_headers(keyword)
        for page in tqdm(range(1, pages+1)):
            print(f"Getting {keyword}, currently at page: {page} ... ")
            url = cls.__base.format(keyword, page)
            blogs = cls._get_content(url, headers)
            # False marks the end of the results; extending with it would
            # raise TypeError, so stop here instead.
            if blogs is False:
                break
            result.extend(blogs)
            time.sleep(random.randint(5, 8))
        print(f"Finished in keyword: {keyword}!")
        return result

    @classmethod
    def search(cls, keyword: str, pages: int = -1):
        """Search for the keyword
        --------------------------

        keyword: str, keyword
        pages: int, how many pages you want to get, default -1 to all pages

        return: DataFrame with one row per blog found
        """

        # '#' would start a URL fragment, so it is percent-encoded up front.
        keyword = keyword.replace('#', '%23')
        if pages == -1:
            result = cls._get_full(keyword)
        else:
            result = cls._get_assigned(keyword, pages)
        result = pd.DataFrame(result)
        return result

## Weibo Google Api for Hot Topic Trend

This is a hot-topic trend fetcher built on a google-api service. It supports getting trend data for a given topic or a given date, and returns the corresponding hot-topic trend data.

In [16]:
from bs4 import BeautifulSoup


class HotTopic:
    """A Second Level Crawler for Hot Topic
    ========================================
    sample usage:
    >>> result = HotTopic.search('keyword')
    """

    __list = "https://google-api.zhaoyizhe.com/google-api/index/mon/list"
    __search = "https://google-api.zhaoyizhe.com/google-api/index/mon/sec?isValid=ads&keyword={}"
    __trend = "https://google-api.zhaoyizhe.com/google-api/index/superInfo?keyword={}"

    @classmethod
    def search(cls, keyword: str = None, date: str = None):
        """List hot topics, optionally filtered by a keyword or a date

        keyword: str, topic keyword; mutually exclusive with date
        date: str, date to look up; mutually exclusive with keyword

        With neither argument the list endpoint is queried. Returns the
        "data" records as a DataFrame without the "_id" column.
        Raises ValueError when both keyword and date are given.
        """
        if keyword is not None and date is not None:
            # Previously this left `url` unbound and failed with a NameError.
            raise ValueError('Specify either `keyword` or `date`, not both')
        if keyword is None and date is None:
            url = cls.__list
        else:
            # Both filters go through the same endpoint and parameter.
            url = cls.__search.format(date if keyword is None else keyword)
        result = requests.get(url).json()
        data = pd.DataFrame(result["data"])
        data = data.drop("_id", axis=1)
        return data

    @classmethod
    def trend(cls, keyword: str):
        """Get the trend of a topic

        Returns a DataFrame built from the "value" entry of each response
        item, with columns "hot" and "tag" indexed by "datetime".
        """
        url = cls.__trend.format(keyword)
        result = requests.get(url).json()
        data = pd.DataFrame(map(lambda x: x['value'], result),
            columns=['datetime', 'hot', 'tag']).set_index('datetime')
        return data

    @classmethod
    def trend_history(cls, keyword: str, freq: str = '3m'):
        """Get the weibo index history of every word matching the keyword

        keyword: str, word to look up
        freq: str, time window, one of "1h", "24h", "1m", "3m"

        Returns a DataFrame with one column per matched word.
        Raises ValueError when freq is not one of the supported windows.
        """
        # Short window name -> value sent as `dateGroup`.
        date_groups = {'1h': '1hour', '24h': '24hour', '1m': '1month', '3m': '3month'}
        if freq not in date_groups:
            raise ValueError('Freq parameter must be in ["1h", "24h", "1m", "3m"]')
        freq = date_groups[freq]

        url = "https://data.weibo.com/index/ajax/newindex/searchword"
        data = {
            "word": f"{keyword}"
        }
        # No Content-Length here: it depends on the body, so it is left for
        # requests to compute for each POST.
        headers = {
            "Host": "data.weibo.com",
            "Origin": "https://data.weibo.com",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.16(0x18001041) NetType/WIFI Language/zh_CN",
            "Referer": "https://data.weibo.com/index/newindex?visit_type=search"
        }
        html = requests.post(url, data=data, headers=headers)
        html = BeautifulSoup(html.text, 'html.parser')
        res = html.find_all('li')
        # SECURITY(review): eval() runs on text returned by a remote server.
        # It is only meant to decode the escaped word; replace it with a safe
        # decoder once the exact escape format has been confirmed.
        wids = [(r.attrs["wid"].strip(r'\"'), eval('"' + r.attrs["word"].replace(r'\"', '') + '"')) for r in res]

        url = "https://data.weibo.com/index/ajax/newindex/getchartdata"
        results = []
        for wid, word in wids:
            post_params = {
                "wid": wid,
                "dateGroup": freq
            }
            res = requests.post(url, data=post_params, headers=headers).json()
            trend = res["data"][0]["trend"]
            # Turn labels such as "1月2日" into "1-2".
            index = [label.replace("月", '-').replace("日", '') for label in trend['x']]
            results.append(pd.Series(trend['s'], index=index, name=word))
        results = pd.concat(results, axis=1)
        return results

## BackTrader Fast Strategy

A subclass of `bt.Strategy` with some frequently used methods.

In [20]:
import datetime
import backtrader as bt


class Strategy(bt.Strategy):
    """Strategy base class with a dated logger and an order-notification hook."""

    def log(self, text: str, datetime: datetime.datetime = None, hint: str = 'INFO'):
        """Print `text` prefixed with `hint` and a date.

        When `datetime` is not given, the date of the current bar of
        `self.data` is used.
        """
        stamp = datetime if datetime else self.data.datetime.date(0)
        print(f'[{hint}] {stamp}: {text}')

    def notify_order(self, order: bt.Order):
        """Log completed and failed orders, then clear `self.order`.

        Orders that are still submitted, accepted or created are ignored.
        """
        status = order.status
        if status in (order.Submitted, order.Accepted, order.Created):
            # Still in flight: nothing to report yet.
            return

        if status in (order.Completed,):
            size = order.executed.size
            name = order.info.get("name", "data")
            price = order.executed.price
            self.log(f'Trade <{size}> <{name}> at <{price:.2f}>')
            # Remember the bar count at which the order was executed.
            self.bar_executed = len(self)
        elif status in (order.Canceled, order.Margin, order.Rejected, order.Expired):
            self.log('Order canceled, margin, rejected or expired', hint='WARN')

        self.order = None

## Notes

### pandas.concat

Avoid `pd.concat` with `axis=1` on large datasets because it is slow; use `pd.concat([df1, df2], axis=0, keys=['keyname1', 'keyname2'], names=['idx_level0_name', 'idx_level1_name'])` instead. Concatenation that has to align on the index, or a plain merge, consumes a lot of resources.

For more information, please refer to [Official Document](https://pandas.pydata.org/docs/reference/api/pandas.concat.html)