# Eastmoney Collector

## Stock Emotion Collector for Guba

In [1]:
import re
import time
import random
import hashlib
import datetime
import requests
import pandas as pd
import akshare as ak
from tqdm import tqdm
from lxml import etree
from functools import wraps
from joblib import Parallel, delayed, Memory

### Some Utility Functions

In [2]:
def format_code(code, format_str='{market}.{code}'):
    """Format a stock code according to ``format_str``.
    
    Signature stub only; the implementation is presumably the one loaded
    from ``../mylib/utils.py`` later in this notebook.
    
    code: str, the code to be formatted
    format_str: str, the output template; the default and the call sites in
        this notebook use the ``{market}`` and ``{code}`` placeholders
    return: str, the formatted code string
    """

def get_proxy(page_size=20):
    """Collect proxy data from https://free.kuaidaili.com.
    
    Signature stub only; the implementation is presumably the one loaded
    from ``../mylib/utils.py`` later in this notebook.
    
    page_size: int, the number of pages to collect
    
    return: list[dict], the proxy list
    """

def proxy_request(
    url: str, 
    proxies: 'dict | list', 
    retry: int = None, 
    timeout: int = 1,
    delay: int = 0,
    verbose: bool = True,
    **kwargs
):
    """Send a request through proxies, retrying until one attempt succeeds.
    
    Signature stub only; the implementation is presumably the one loaded
    from ``../mylib/utils.py`` later in this notebook.
    
    url: str, the target url
    proxies: list or dict, the proxies to use
    retry: int, maximum number of attempts; defaults to the length of the
        proxies list
    timeout: int, time limit for each attempt
    delay: int, how long to wait after a failed attempt
    verbose: bool, whether to show detailed information
    **kwargs: extra keyword arguments forwarded to the underlying GET call
        (presumably ``requests.get``)
    return: the response
    """

def get_stock_code():
    """Get all stock codes available in the current market.

    Signature stub only; the implementation is presumably the one loaded
    from ``../mylib/utils.py`` later in this notebook.
    """

In [3]:
%run ../mylib/utils.py

In [4]:
# Run date (a ``datetime.date``); used below to name the vote-crawl parquet file.
today = datetime.datetime.today().date()

Use the utility function `get_proxy` to get some proxies for later use.

In [5]:
# Wrap ``get_proxy`` in a joblib on-disk cache rooted at /tmp/joblib_test/
# and call it with its default arguments; ``proxies`` is used by the
# comment crawler further down.
mem = Memory('/tmp/joblib_test/')
proxies = mem.cache(get_proxy)()

________________________________________________________________________________
[Memory] Calling __main__--Users-oak-Desktop-Quant-mylib-utils.get_proxy...
get_proxy()
[+] https://free.kuaidaili.com/free/inha/1/ Get Success!
[+] https://free.kuaidaili.com/free/inha/2/ Get Success!
[+] https://free.kuaidaili.com/free/inha/3/ Get Success!
[+] https://free.kuaidaili.com/free/inha/4/ Get Success!
[+] https://free.kuaidaili.com/free/inha/5/ Get Success!
[+] https://free.kuaidaili.com/free/inha/6/ Get Success!
[+] https://free.kuaidaili.com/free/inha/7/ Get Success!
[+] https://free.kuaidaili.com/free/inha/8/ Get Success!
[+] https://free.kuaidaili.com/free/inha/9/ Get Success!
[+] https://free.kuaidaili.com/free/inha/10/ Get Success!
[+] https://free.kuaidaili.com/free/inha/11/ Get Success!
[+] https://free.kuaidaili.com/free/inha/12/ Get Success!
[+] https://free.kuaidaili.com/free/inha/13/ Get Success!
[+] https://free.kuaidaili.com/free/inha/14/ Get Success!
[+] https://free.kuaidaili.c

### Crawl the Web

This part is dedicated to crawling the up-or-down probabilities provided by investors.

In [6]:
def crawl_stock(code: str):
    """Fetch the Guba "LookUpAndDown" vote record for a single stock.

    code: str, the stock code; it is rewritten with
        ``format_code(code, '{market}{code}')`` before being put in the URL
    return: pd.Series, the ``Data`` entry of the downloaded payload plus a
        ``code`` entry holding the formatted code

    Raises ``requests.HTTPError`` for a non-success HTTP status and
    ``ValueError`` / ``SyntaxError`` when the payload is not a plain literal.
    """
    # Local import so the cell does not depend on the notebook's import cell.
    import ast

    today = datetime.datetime.today().date()
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6 Safari/605.1.15",
        "Referer": "http://guba.eastmoney.com/",
        "Host": "gubacdn.dfcfw.com"
    }
    code = format_code(code, '{market}{code}')
    url = f"http://gubacdn.dfcfw.com/LookUpAndDown/{code}.js"
    # NOTE(review): no timeout is set, so a stalled connection blocks the worker.
    res = requests.get(url, headers=headers)
    res.raise_for_status()

    # The file is a JS assignment; drop exactly that prefix. ``str.strip``
    # with a string argument removes a *set of characters* from both ends,
    # which only worked by accident.
    prefix = 'var LookUpAndDown='
    payload = res.text.strip()
    if payload.startswith(prefix):
        payload = payload[len(prefix):]

    # ``null`` fields are filled with today's date, as before, which also
    # turns the text into a valid Python literal.
    payload = payload.replace('null', f'"{today}"')

    # ``literal_eval`` accepts the same dict/list/str/number literals the old
    # ``eval`` call did, but cannot execute code sent by the remote server.
    res = ast.literal_eval(payload)
    data = pd.Series(res['Data'])
    data['code'] = code
    return data

In [12]:
# Use the '代码' column of akshare's ``stock_zh_a_spot_em`` table as the code list.
codes = ak.stock_zh_a_spot_em()['代码'].to_list()
# Crawl every code with 8 joblib workers; each call yields one Series.
datas = Parallel(n_jobs=8)(delayed(crawl_stock)(code) for code in codes)
# concat(axis=1) puts one Series per column, so transpose to one row per stock.
data = pd.concat(datas, axis=1).T
data.Date = pd.to_datetime(data.Date)
data = data.set_index('code')
# Pin the column dtypes before writing.
data = data.astype({"TapeZ": "float32", "TapeD": "float32", "TapeType": "uint8", "Date": "datetime64[ns]"})
# One parquet file per run, named after the module-level ``today``.
data.to_parquet(f'../data/derivative_indicators/guba_votes/{today}.parquet')

### Crawl the Comments

In [13]:
def overview(code: str, page: int, try_times: int = 100):
    """Scrape one page of the Guba post list for ``code``.

    The page is fetched through ``proxy_request`` using the module-level
    ``proxies``. If the scraped columns have different lengths the page is
    fetched again, for at most ``try_times + 1`` attempts in total.

    code: str, the board code used in the list URL
    page: int, the page number to fetch
    try_times: int, how many misaligned results are tolerated before giving up
    return: pd.DataFrame with the columns ``read``, ``comments``, ``title``,
        ``href``, ``author`` and ``datetime`` holding the raw scraped text, or
        ``None`` when every attempt produced misaligned columns
    """
    url = f"http://guba.eastmoney.com/list,{code},f_{page}.html"
    # Every post is one <div> of the list container, except the header row.
    row = '//*[@id="articlelistnew"]/div[not(@class="dheader")]'
    fields = {
        "read": '/span[1]/text()',
        "comments": '/span[2]/text()',
        "title": '/span[3]/a/text()',
        "href": '/span[3]/a/@href',
        "author": '/span[4]/a/font/text()',
        "datetime": '/span[5]/text()',
    }

    tries = 0
    while tries <= try_times:
        html = etree.HTML(proxy_request(url, proxies=proxies, verbose=False).text)
        columns = {name: html.xpath(row + suffix) for name, suffix in fields.items()}
        # we might still get unaligned data because of the anti-crawl system;
        # pandas signals columns of unequal length with a ValueError
        try:
            return pd.DataFrame(columns)
        except ValueError:
            tries += 1
            print(f"try time {tries} for page {page} aligned failed, that's probably because the proxy ip is banned.")
    # Out of attempts: the caller has to cope with a missing page.
    return None

In [15]:
results = []
end_page = 500
# Scrape list pages 1..end_page of board 'zssh000001' with 8 joblib workers;
# each element of ``results`` is whatever ``overview`` returned for that page.
results = Parallel(n_jobs=8)(delayed(overview)('zssh000001', i) for i in range(1, end_page + 1))

try time 1 for page 137 aligned failed, that's probably because the proxy ip is banned.


In [29]:
today = datetime.datetime.today()
year = today.year
month = today.month


def _parse_count(text):
    """Convert a scraped counter such as ``'1234'`` or ``'1.2万'`` to an int."""
    text = str(text).strip()
    if text.endswith('万'):
        # 万 is a unit of ten thousand: '1.2万' -> 12000
        return int(round(float(text[:-1]) * 10000))
    return int(text)


# The scraped timestamps carry no year ('MM-DD HH:MM'), so one is prepended.
for res in results:
    if res is None:
        # ``overview`` gave up on this page; ``pd.concat`` below drops it too.
        continue
    res['datetime'] = res['datetime'].map(lambda x: (str(year) if int(x[:2]) <= month else str(year - 1)) + '-' + x)
    res['datetime'] = pd.to_datetime(res['datetime'])
    # NOTE(review): once ``year`` has been decremented here, the ``year - 1``
    # branch above can push later pages back one more year than intended.
    # Left unchanged; confirm against the real page ordering.
    if (res['datetime'] < str(year)).any():
        year -= 1
result = pd.concat(results, axis=0)
# Parse the counters before casting. Casting the raw strings first fails on
# any '万' value, and uint16 cannot hold counts above 65535.
for column in ('read', 'comments'):
    result[column] = result[column].map(_parse_count).astype('uint32')
# ``today.date`` without the call would format the bound method's repr.
result.to_parquet(f'../data/derivative_indicators/guba_comments/{today.date()}.parquet')

## Stock Financial Report Collector

In [None]:
def get_balance_sheet(code):
    """Download one stock's balance sheet and keep only its numeric columns.

    The table comes from ``ak.stock_balance_sheet_by_report_em``. Columns that
    can be cast to float are kept together with ``SECUCODE``, ``REPORT_DATE``
    and ``NOTICE_DATE``. Rows are then re-indexed onto the range between the
    first and last report date, and ``SECUCODE`` is filled in on every row.

    code: the symbol passed to akshare
    return: pd.DataFrame with ``REPORT_DATE`` as a regular column, or ``None``
        when the table is empty or any step fails (best effort, so one bad
        stock does not abort a parallel batch)
    """
    # numpy is not among this notebook's visible imports. Without it the
    # NameError on ``np`` would be swallowed by the handler below and every
    # call would silently return None.
    import numpy as np

    try:
        data = ak.stock_balance_sheet_by_report_em(symbol=code)
        if data.empty:
            return None
        # Columns that survive a float cast are the numeric ones.
        numeric_columns = data.columns[data.astype('f8', errors='ignore').dtypes != 'object']
        keep = numeric_columns.union(['SECUCODE', 'REPORT_DATE', 'NOTICE_DATE'])
        data = data.loc[:, keep]
        data = data.replace({None: np.nan})
        data = data.astype('f8', errors='ignore')
        data[['REPORT_DATE', 'NOTICE_DATE']] = data[['REPORT_DATE', 'NOTICE_DATE']].astype('datetime64[ns]')
        data = data.set_index('REPORT_DATE')
        data = data.reindex(pd.date_range(data.index.min(), data.index.max(), freq='q'))
        data.index.name = "REPORT_DATE"
        data = data.reset_index()
        # Rows added by the reindex have no code; copy the first known one.
        data['SECUCODE'] = data['SECUCODE'].dropna().iloc[0]
        return data
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still stop the run.
        return None

In [361]:
# Download the balance sheet of every code with 8 joblib workers; entries are
# ``None`` for stocks where ``get_balance_sheet`` returned nothing.
codes = get_stock_code()
joblibres = Parallel(n_jobs=8)(delayed(get_balance_sheet)(code) for code in codes)

                                               