# Eastmoney Collector

## Stock Emotion Collector for Guba

In [1]:
import ast
import datetime
import hashlib
import random
import re
import time
from functools import wraps

import akshare as ak
import pandas as pd
import requests
from dask.multiprocessing import get
from diskcache import Cache
from lxml import etree
from tqdm import tqdm

### Some Utility Functions

In [2]:
def cache_wrapper(directory: str = '../data/cache/', expire: int = 3600):
    """Build a decorator that memoizes a function's results in a disk cache.

    Args:
        directory: Directory handed to ``diskcache.Cache``.
        expire: Value passed as ``expire`` when a result is stored.

    Returns:
        A decorator. The wrapped function looks its result up under a key
        derived from the function name and the ``str`` of its arguments; a
        stored value of ``None`` is treated as a miss, so the function is
        called again and the new result is written back.
    """
    store = Cache(directory=directory)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Key layout: "<name>:<md5 of name + positional repr + keyword repr>".
            signature = func.__name__ + str(args) + str(kwargs)
            digest = hashlib.md5(signature.encode('utf-8')).hexdigest()
            key = func.__name__ + ':' + digest

            cached = store.get(key=key)
            if cached is None:
                cached = func(*args, **kwargs)
                store.set(key=key, value=cached, expire=expire)
            return cached
        return wrapper
    return decorator

def wrap_code(code: str, formatstr: str = '{market}.{code}'):
    """Attach an exchange market tag to a bare numeric stock code.

    The market is picked from the leading digit of ``code`` (followed by at
    least five more digits): ``6``/``9`` -> ``sh``, ``0``/``2``/``3`` -> ``sz``,
    ``8``/``4`` -> ``bj``. Patterns are anchored at the start only.

    Args:
        code: Stock code consisting of digits only, e.g. ``'600519'``.
        formatstr: ``str.format`` template that receives the keyword
            arguments ``market`` and ``code``.

    Returns:
        The formatted string, or ``None`` when no market pattern matches.

    Raises:
        ValueError: If ``code`` contains anything other than digits.
    """
    if not code.isdigit():
        raise ValueError('It seems your code has already been wrapped')
    # Raw strings: a plain '\d' is an invalid string escape and triggers a
    # SyntaxWarning on recent Python versions.
    market_patterns = (
        ('sh', r'6\d{5}|9\d{5}'),
        ('sz', r'0\d{5}|2\d{5}|3\d{5}'),
        ('bj', r'8\d{5}|4\d{5}'),
    )
    for market, pattern in market_patterns:
        if re.match(pattern, code):
            return formatstr.format(code=code, market=market)
    # No known market prefix: callers receive None and must handle it.
    return None
    
@cache_wrapper(expire=30 * 24 * 3600)
def get_proxy(page_size: int = 20):
    """Scrape free proxies from kuaidaili and report how many of them respond.

    Args:
        page_size: Number of listing pages to read, starting from page 1.

    Returns:
        A list of single-entry dicts ``{scheme: 'scheme://ip:port'}`` in the
        shape expected by the ``proxies`` argument of ``requests``. The list
        holds every scraped proxy, not only the ones that passed the probe.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    }
    url_list = [f'https://free.kuaidaili.com/free/inha/{i}/' for i in range(1, page_size + 1)]
    proxies = []
    for url in url_list:
        data = pd.read_html(url)[0][['IP', 'PORT', '类型']].drop_duplicates()
        print(f'[+] {url} Get Success!')
        data['类型'] = data['类型'].str.lower()
        proxy = (data['类型'] + '://' + data['IP'] + ':' + data['PORT'].astype('str')).to_list()
        proxies += [{address.split('://')[0]: address} for address in proxy]
        # Pause between listing pages to stay polite to the source site.
        time.sleep(0.8)

    available_proxies = []
    for proxy in proxies:
        # requests only routes through a proxy whose key matches the URL
        # scheme, so probe with the proxy's own scheme; probing https with an
        # 'http' proxy would bypass the proxy and always look "available".
        scheme = next(iter(proxy))
        try:
            res = requests.get(f'{scheme}://www.baidu.com',
                headers=headers, proxies=proxy, timeout=1)
            res.raise_for_status()
            available_proxies.append(proxy)
        except Exception as e:
            print(str(e))

    # Guard the ratio: nothing scraped (e.g. page_size=0) must not raise
    # ZeroDivisionError.
    rate = len(available_proxies) / len(proxies) * 100 if proxies else 0.0
    print(f'[=] Get {len(proxies)} proxies, while {len(available_proxies)} are available. '
        f'Current available rate is {rate:.2f}%')
    # NOTE(review): the full list is returned, not `available_proxies`; the
    # probe above is informational only. Confirm this is intended.
    return proxies

def proxy_request(
    url: str, 
    proxies: 'dict | list', 
    retry: 'int | None' = None, 
    timeout: int = 1,
    delay: int = 0,
    verbose: bool = True,
    **kwargs
):
    """GET ``url`` through randomly ordered proxies until one succeeds.

    Args:
        url: Address to request.
        proxies: A single ``requests``-style proxy dict or a sequence of them.
            The caller's sequence is left untouched.
        retry: Maximum number of proxies to try; falls back to the number of
            proxies when ``None`` or ``0``.
        timeout: Per-request timeout forwarded to ``requests.get``.
        delay: Seconds to sleep after each failed attempt.
        verbose: Print one line per attempt when true.
        **kwargs: Extra keyword arguments forwarded to ``requests.get``.

    Returns:
        The first response that passes ``raise_for_status``, or ``None`` when
        every attempt failed or no proxy was supplied.
    """
    if isinstance(proxies, dict):
        proxies = [proxies]
    # Shuffle a private copy: shuffling in place would reorder the caller's
    # (typically shared, module-level) proxy list and fail on tuples.
    candidates = list(proxies)
    random.shuffle(candidates)
    retry = retry or len(candidates)
    for try_times, proxy in enumerate(candidates[:max(retry, 0)], start=1):
        try:
            response = requests.get(url, proxies=proxy, timeout=timeout, **kwargs)
            response.raise_for_status()
        except Exception as e:
            if verbose:
                print(f'[-] [{e}] {url}, try {try_times}/{retry}')
            time.sleep(delay)
        else:
            if verbose:
                print(f'[+] {url}, try {try_times}/{retry}')
            return response
    # Every attempt failed (or there was nothing to try).
    return None

In [3]:
# Current local calendar date, used to name the output files below.
today = datetime.date.today()

Use the utility function `get_proxy` to get some proxies for later use.

In [4]:
# Build the shared proxy pool; get_proxy is wrapped by cache_wrapper, so a
# previously cached result may be returned instead of re-scraping.
proxies = get_proxy()

### Crawl for the Web

This part crawls the up-or-down probabilities provided by investors.

In [5]:
def crawl_stock(code: str):
    """Download and parse the LookUpAndDown record of one stock.

    Args:
        code: Bare numeric stock code; it is prefixed with its market via
            ``wrap_code(code, '{market}{code}')`` before building the URL.

    Returns:
        A ``pandas.Series`` built from the ``'Data'`` entry of the response,
        with an extra ``'code'`` entry holding the market-prefixed code.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
        ValueError, SyntaxError: If the payload is not a plain literal.
    """
    today = datetime.datetime.today().date()
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6 Safari/605.1.15",
        "Referer": "http://guba.eastmoney.com/",
        "Host": "gubacdn.dfcfw.com"
    }
    # NOTE(review): wrap_code returns None for codes with an unknown prefix,
    # which would turn into a request for ".../None.js".
    code = wrap_code(code, '{market}{code}')
    url = f"http://gubacdn.dfcfw.com/LookUpAndDown/{code}.js"
    # Always bound the request; without a timeout a stalled server blocks forever.
    res = requests.get(url, headers=headers, timeout=10)
    res.raise_for_status()

    # Remove the JavaScript assignment as a real prefix. str.strip(chars)
    # treats its argument as a character set and only worked by accident.
    prefix = 'var LookUpAndDown='
    payload = res.text.strip()
    if payload.startswith(prefix):
        payload = payload[len(prefix):]
    # `null` is not a Python literal; as before, it is substituted by today's date.
    payload = payload.replace('null', f'"{today}"')
    # literal_eval only accepts literals, so remote text can no longer execute
    # arbitrary code the way eval() allowed.
    res = ast.literal_eval(payload)
    data = pd.Series(res['Data'])
    data['code'] = code
    return data

In [None]:
# Run crawl_stock for every code in the akshare spot table and store the
# combined result as a parquet file named after `today`.
codes = ak.stock_zh_a_spot_em()['代码'].to_list()
# One dask task per code, keyed 'result:<code>'.
dsk = {'result:' + code: (crawl_stock, code) for code in codes}
datas = get(dsk, list(dsk))
data = (
    pd.concat(datas, axis=1).T
    .assign(Date=lambda frame: pd.to_datetime(frame.Date))
    .set_index('code')
    .astype({"TapeZ": "float32", "TapeD": "float32", "TapeType": "uint8", "Date": "datetime64[ns]"})
)
data.to_parquet(f'../data/derivative-indicators/guba-votes/{today}.parquet')

### Crawl the Comments

In [36]:
def overview(code: str, page: int, try_times: int = 100):
    """Scrape one page of the Guba post list for ``code`` into a DataFrame.

    Each attempt fetches the page through ``proxy_request`` using the
    module-level ``proxies`` pool and extracts six per-post fields. An attempt
    fails when no response is obtained or when the extracted lists have
    different lengths.

    Args:
        code: Board identifier inserted into the list URL.
        page: Page number inserted into the list URL.
        try_times: Number of retries after the first attempt, so up to
            ``try_times + 1`` attempts are made.

    Returns:
        A DataFrame with columns ``read``, ``comments``, ``title``, ``href``,
        ``author`` and ``datetime`` (all raw strings), or ``None`` when every
        attempt failed.
    """
    url = f"http://guba.eastmoney.com/list,{code},f_{page}.html"
    rows = '//*[@id="articlelistnew"]/div[not(@class="dheader")]'
    # Column name -> XPath relative to each post row.
    fields = {
        "read": 'span[1]/text()',
        "comments": 'span[2]/text()',
        "title": 'span[3]/a/text()',
        "href": 'span[3]/a/@href',
        "author": 'span[4]/a/font/text()',
        "datetime": 'span[5]/text()',
    }
    for tries in range(1, try_times + 2):
        response = proxy_request(url, proxies=proxies, verbose=False)
        # proxy_request returns None when every proxy failed; count that as a
        # failed attempt instead of crashing on `.text`.
        html = etree.HTML(response.text) if response is not None else None
        if html is not None:
            # we might still get unaligned data because of the anti-crawl system
            try:
                return pd.DataFrame({name: html.xpath(f'{rows}/{path}') for name, path in fields.items()})
            except ValueError:
                # Columns of unequal length: the page was incomplete, retry.
                pass
        print(f"try time {tries} for page {page} aligned failed, that's probably because the proxy ip is banned.")
    return None

In [37]:
# Schedule one `overview` task per list page of board 'zssh000001' and run
# them with the dask multiprocessing scheduler.
results = []
end_page = 500
dsk = {f'page{i}': (overview, 'zssh000001', i) for i in range(1, end_page + 1)}
results = get(dsk, list(dsk))

try time 1 aligned failed, that's probably because the proxy ip is banned.
try time 1 aligned failed, that's probably because the proxy ip is banned.
try time 1 aligned failed, that's probably because the proxy ip is banned.
try time 1 aligned failed, that's probably because the proxy ip is banned.
try time 1 aligned failed, that's probably because the proxy ip is banned.
try time 1 aligned failed, that's probably because the proxy ip is banned.


In [None]:
# The scraped timestamps carry no year. Each page is prefixed with the current
# guess for `year` (or the previous year when the month is later than the
# current month); once a page contains a date before `year`, the guess is
# moved back by one for the following pages.
today = datetime.datetime.today()
year = today.year
month = today.month
for res in results:
    # overview returns None for a page that never parsed; skip it here
    # (pd.concat below drops None entries on its own).
    if res is None:
        continue
    res['datetime'] = res['datetime'].map(lambda x: (str(year) if int(x[:2]) <= month else str(year - 1)) + '-' + x)
    res['datetime'] = pd.to_datetime(res['datetime'])
    if (res['datetime'] < str(year)).any():
        year -= 1
result = pd.concat(results, axis=0)
# NOTE(review): uint16 tops out at 65535; confirm that read/comment counts
# cannot exceed it.
result = result.astype({"read": "uint16", "comments": "uint16"})
# `today.date` without parentheses would format the bound method's repr into
# the file name; call it to get the YYYY-MM-DD date.
result.to_parquet(f'../data/derivative-indicators/guba-comments/{today.date()}.parquet')

## Stock Financial Report Collector

In [1]:
import akshare as ak

In [2]:
# Fetch the balance-sheet table for symbol SH600519 through akshare.
# Presumably one row per report period — verify against the akshare docs.
data = ak.stock_balance_sheet_by_report_em(symbol='SH600519')

                                               

In [9]:
# List the columns whose dtype is no longer `object` after trying to cast the
# whole frame to float64 with cast errors ignored.
data.columns[data.astype('f8', errors='ignore').dtypes != 'object']

Index(['ACCEPT_DEPOSIT_INTERBANK', 'ACCOUNTS_PAYABLE', 'ACCOUNTS_RECE',
       'ACCRUED_EXPENSE', 'ADVANCE_RECEIVABLES', 'AGENT_TRADE_SECURITY',
       'AGENT_UNDERWRITE_SECURITY', 'AMORTIZE_COST_FINASSET',
       'AMORTIZE_COST_FINLIAB', 'AMORTIZE_COST_NCFINASSET',
       ...
       'TOTAL_OTHER_RECE_YOY', 'TOTAL_PARENT_EQUITY_YOY',
       'TRADE_FINASSET_NOTFVTPL_YOY', 'TRADE_FINASSET_YOY',
       'TRADE_FINLIAB_NOTFVTPL_YOY', 'TRADE_FINLIAB_YOY',
       'TREASURY_SHARES_YOY', 'UNASSIGN_RPOFIT_YOY',
       'UNCONFIRM_INVEST_LOSS_YOY', 'USERIGHT_ASSET_YOY'],
      dtype='object', length=304)

In [16]:
# Show the report-period label of every row.
data['REPORT_DATE_NAME']

0      2022中报
1     2022一季报
2      2021年报
3     2021三季报
4      2021中报
       ...   
82     2001年报
83     2001中报
84     2000年报
85     1999年报
86     1998年报
Name: REPORT_DATE_NAME, Length: 87, dtype: object