### 解析tushare 的trading模块中的get_hist_data函数

In [1]:
# -*- coding:utf-8 -*- 
"""
交易数据接口 
Created on 2014/07/31
@author: Jimmy Liu
@group : waditu
@contact: jimmysoa@sina.cn
"""
from __future__ import division

import time
import json
import lxml.html
from lxml import etree
import pandas as pd
import numpy as np
import datetime
import tushare
from tushare.stock import cons as ct
import re
from pandas.compat import StringIO
from tushare.util import dateu as du
from tushare.util.formula import MA
import os
from tushare.util.conns import get_apis, close_apis
from tushare.stock.fundamental import get_stock_basics
try:
    from urllib.request import urlopen, Request
except ImportError:
    from urllib2 import urlopen, Request

In [3]:
def get_hist_data(code=None, start=None, end=None,
                  ktype='D', retry_count=3,
                  pause=0.001):
    """
    Fetch the historical trading records of a single stock.

    Parameters
    ------
      code : string
          Stock code, e.g. 600848.
      start : string
          Start date, format YYYY-MM-DD. When None no lower bound is applied,
          so the earliest rows returned by the API are kept.
      end : string
          End date, format YYYY-MM-DD. When None no upper bound is applied,
          so the most recent rows returned by the API are kept.
      ktype : string
          Bar type: D=daily, W=weekly, M=monthly, 5/15/30/60 = minute bars.
          Defaults to D.
      retry_count : int, default 3
          Number of request attempts made before giving up.
      pause : float, default 0.001
          Seconds to sleep before every request attempt, to avoid issuing
          requests too close together.

    Returns
    -------
      DataFrame or None
          Frame indexed by date and sorted newest first. Per the original
          documentation the fields are: date, open, high, close, low, volume,
          price change, percent change, 5/10/20-day average price,
          5/10/20-day average volume, turnover.
          None is returned when the response body is shorter than 15 bytes
          or when it contains no records.

    Raises
    ------
      TypeError
          If ktype matches neither ct.K_LABELS nor ct.K_MIN_LABELS.
      IOError
          If every one of the retry_count attempts fails at the network level.
    """
    symbol = ct._code_to_symbol(code)
    period = ktype.upper()
    is_k_label = period in ct.K_LABELS
    if is_k_label:
        url = ct.DAY_PRICE_URL % (ct.P_TYPE['http'], ct.DOMAINS['ifeng'],
                                  ct.K_TYPE[period], symbol)
    elif ktype in ct.K_MIN_LABELS:
        url = ct.DAY_PRICE_MIN_URL % (ct.P_TYPE['http'], ct.DOMAINS['ifeng'],
                                      symbol, ktype)
    else:
        raise TypeError('ktype input error.')

    for _ in range(retry_count):
        time.sleep(pause)
        try:
            response = urlopen(Request(url), timeout=10)
            try:
                lines = response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
        except Exception as e:
            # Network-level failure: report it and try again.
            print(e)
            continue

        if len(lines) < 15:  # no data
            return None

        js = json.loads(lines.decode('utf-8') if ct.PY3 else lines)
        records = js['record']
        if not records:
            # An empty record list would otherwise fail on records[0] below.
            return None

        is_index = code in ct.INDEX_LABELS
        # Index quotes, and any payload with 14 fields per row, use the
        # index column layout.
        if (is_index and is_k_label) or len(records[0]) == 14:
            cols = ct.INX_DAY_PRICE_COLUMNS
        else:
            cols = ct.DAY_PRICE_COLUMNS

        df = pd.DataFrame(records, columns=cols)
        if period in ['D', 'W', 'M']:
            # Strip thousands separators and turn blank cells into 0 so the
            # float conversion below succeeds.
            df = df.applymap(lambda x: x.replace(u',', u''))
            df[df == ''] = 0
        for col in cols[1:]:
            df[col] = df[col].astype(float)
        if start is not None:
            df = df[df.date >= start]
        if end is not None:
            df = df[df.date <= end]
        if is_index and ktype in ct.K_MIN_LABELS:
            df = df.drop('turnover', axis=1)
        df = df.set_index('date')
        df = df.sort_index(ascending=False)
        return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)

In [4]:
# Sample inputs mirroring get_hist_data's parameters, used by the
# step-by-step cells that follow.
code = '603987'
start = None
end = None  # must be a plain None; a trailing comma would bind the tuple (None,)
ktype = 'D'
retry_count = 3
pause = 0.001

# Convert the bare stock code into the symbol form used in the request URLs
# (the recorded output for '603987' is 'sh603987').
symbol = ct._code_to_symbol(code)
print(symbol)

sh603987


In [5]:
# Build the ifeng price URL for the requested bar type.
url = ''
if ktype.upper() in ct.K_LABELS:
    # ktype is one of the labels in ct.K_LABELS (matched case-insensitively).
    url = ct.DAY_PRICE_URL % (
        ct.P_TYPE['http'],
        ct.DOMAINS['ifeng'],
        ct.K_TYPE[ktype.upper()],
        symbol,
    )
else:
    # Anything else must be one of the labels in ct.K_MIN_LABELS.
    if ktype not in ct.K_MIN_LABELS:
        raise TypeError('ktype input error.')
    url = ct.DAY_PRICE_MIN_URL % (
        ct.P_TYPE['http'],
        ct.DOMAINS['ifeng'],
        symbol,
        ktype,
    )

In [6]:
url

'http://api.finance.ifeng.com/akdaily/?code=sh603987&type=last'

In [7]:
# Backward-adjusted ("hfq") price factors.
# Build the request for the adjustment-factor endpoint of the 'vsf' domain
# for the current symbol.
request = Request(ct.HIST_FQ_FACTOR_URL%(ct.P_TYPE['http'],
                                             ct.DOMAINS['vsf'], symbol))
# Same URL expression again as the cell's last line, so the notebook displays it.
ct.HIST_FQ_FACTOR_URL%(ct.P_TYPE['http'],ct.DOMAINS['vsf'], symbol)

'http://vip.stock.finance.sina.com.cn/api/json.php/BasicStockSrv.getStockFuQuanData?symbol=sh603987&type=hfq'

In [8]:
# Download the raw response body, waiting at most 10 seconds.
text = urlopen(request, timeout=10).read()
# print(text)  # uncomment to inspect the raw payload

In [9]:
# Decode the payload and drop its first and last characters.
text = text.decode(encoding='utf-8')[1:-1]
# Rewrite the payload into valid JSON: quote the keys that start with an
# underscore, quote the bare `total` and `data` keys, then turn the remaining
# underscores (the date separators) into dashes. The order of the replacements
# matters, so it is kept exactly as written.
text = (
    text.replace('{_', '{"')
        .replace('total', '"total"')
        .replace('data', '"data"')
        .replace(':"', '":"')
        .replace('",_', '","')
        .replace('_', '-')
)
text = json.loads(text)

In [30]:
#print(text)

In [33]:
# Two-column frame: the keys of text['data'] become 'date' and the matching
# values become 'factor'. The rows keep the mapping's iteration order, so they
# are not sorted by date (see the printed output).
df = pd.DataFrame({'date':list(text['data'].keys()), 'factor':list(text['data'].values())})
print(df)

            date   factor
0     2005-05-20   2.5494
1     1994-02-03   5.8500
2     2003-11-27   7.9225
3     2003-02-19   9.1003
4     2001-02-21   9.7618
5     2003-01-22   9.1003
6     2011-07-13  23.3048
7     2012-03-05  14.6607
8     1998-07-08  11.3915
9     1998-04-02  11.2302
10    1994-06-16   3.7800
11    2004-10-28   4.8729
12    2018-07-16  27.6583
13    1997-11-12   9.2456
14    2001-03-09  10.0522
15    2005-08-01   2.5171
16    2003-09-19   8.6808
17    2013-07-01  10.0721
18    1997-09-09  10.3105
19    1994-08-08   4.6561
20    2017-05-04  22.2387
21    1994-10-13   5.7940
22    2004-05-26   6.6962
23    2009-12-09  27.2266
24    2010-01-15  26.4840
25    2004-09-01   6.6801
26    2018-03-02  28.6245
27    2010-09-16  30.9394
28    2004-03-19   8.8583
29    2006-10-19   5.8833
...          ...      ...
5776  2006-12-19   5.6168
5777  2001-01-09  11.9239
5778  2010-04-23  35.4517
5779  2009-06-18  15.1555
5780  2013-05-20  11.8238
5781  1995-07-24   4.6309
5782  2002-0

In [34]:
# Reset the date range; with both bounds None it falls back to
# du.today_last_year() .. du.today().
start,end=None,None
start = du.today_last_year() if start is None else start
end = du.today() if end is None else end
# Split the range with du.get_quarts and keep the first entry for the
# single-request walkthrough below (qt[0] and qt[1] are later formatted
# into the quarter URL).
qs = du.get_quarts(start, end)
qt = qs[0]
# Emits the "[Getting data:]" banner seen in the cell output.
ct._write_head()


[Getting data:]

In [35]:
# NOTE(review): _get_index_url is neither defined nor imported in any visible
# cell; presumably it comes from tushare.stock.trading -- confirm, otherwise
# this cell raises NameError on a fresh kernel.
# NOTE(review): the recorded output shows stockid/000546 although the visible
# cells set code='603987' -- looks like a stale output from an out-of-order
# run; re-run to confirm.
index=False;retry_count=3;pause=0.01
url=_get_index_url(index, code, qt)
#data = _parse_fq_data(_get_index_url(index, code, qt), index,retry_count, pause)

In [36]:
url

'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_FuQuanMarketHistory/stockid/000546.phtml?year=2018&jidu=4'

In [37]:
# Fetch the quarter page (10-second timeout) and decode it as GBK.
request = Request(url)
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')

In [38]:
# Parse the decoded page into an lxml document; StringIO wraps the string in a
# file-like object for the parser.
html = lxml.html.parse(StringIO(text))

In [39]:
# Select every <table> element whose id attribute is "FundHoldSharesTable".
res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')

In [40]:
# Serialise each matched table node back to markup and concatenate the pieces
# into one string; under Python 3 the serialised output is decoded as UTF-8
# first.
sarr = ''.join(
    etree.tostring(node).decode('utf-8') if ct.PY3 else etree.tostring(node)
    for node in res
)

In [41]:
# Parse the table markup with pandas, skipping rows 0 and 1, and keep the
# first table returned.
df = pd.read_html(sarr, skiprows = [0, 1])[0]

In [42]:
# Normalise the scraped table: name the columns, convert the date column to
# datetimes and drop repeated dates.
if len(df) == 0:
    # Nothing was scraped: fall back to an empty frame. (A bare
    # `pd.DataFrame()` expression would be discarded and have no effect.)
    df = pd.DataFrame()
else:
    # With `index` set only the first seven adjusted-price columns are used.
    df.columns = ct.HIST_FQ_COLS[0:7] if index else ct.HIST_FQ_COLS
    # `object` is the builtin that the removed alias `np.object` pointed to;
    # pd.to_datetime replaces the deprecated astype(np.datetime64) cast.
    if df['date'].dtypes == object:
        df['date'] = pd.to_datetime(df['date'])
    df = df.drop_duplicates('date')

  return self.apply('astype', dtype=dtype, **kwargs)


In [43]:
# Run the library helper on the same URL for comparison with the manual steps.
# NOTE(review): _parse_fq_data is neither defined nor imported in any visible
# cell; presumably it comes from tushare.stock.trading -- confirm.
data = _parse_fq_data(url, index,retry_count, pause)

  return self.apply('astype', dtype=dtype, **kwargs)


In [44]:
# Rebuild the quarter URL by hand: ct.HIST_INDEX_URL is the template when
# `index` is set, ct.HIST_FQ_URL otherwise; both take the same five fields.
url = (ct.HIST_INDEX_URL if index else ct.HIST_FQ_URL) % (
    ct.P_TYPE['http'], ct.DOMAINS['vsf'], code, qt[0], qt[1])

In [45]:
url

'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_FuQuanMarketHistory/stockid/000546.phtml?year=2018&jidu=4'

In [46]:
# Fall back to an empty frame when the first fetch produced nothing.
if data is None:
     data = pd.DataFrame()
# Fetch every remaining entry of qs (index 0 was handled in the cells above).
# NOTE(review): each iteration overwrites df and nothing is merged into
# `data` -- presumably this cell is unfinished; confirm the intent.
if len(qs)>1:
    for d in range(1, len(qs)):
        qt = qs[d]
        ct._write_console()
        df = _parse_fq_data(_get_index_url(index, code, qt), index,
                                retry_count, pause)

##

  return self.apply('astype', dtype=dtype, **kwargs)


##

In [47]:
import os
import re
import numbers
import collections
import warnings

from distutils.version import LooseVersion

import numpy as np

from pandas.io.common import _is_url, urlopen, parse_url
from pandas.io.parsers import TextParser
from pandas.compat import (lrange, lmap, u, string_types, iteritems,
                           raise_with_traceback, binary_type)
from pandas.core import common as com
from pandas import Series


# Optional-dependency probes: each flag records whether the corresponding
# import succeeded in this environment.

# BeautifulSoup 4
try:
    import bs4
except ImportError:
    _HAS_BS4 = False
else:
    _HAS_BS4 = True


# lxml
try:
    import lxml
except ImportError:
    _HAS_LXML = False
else:
    _HAS_LXML = True


# html5lib
try:
    import html5lib
except ImportError:
    _HAS_HTML5LIB = False
else:
    _HAS_HTML5LIB = True

In [48]:
_HAS_LXML

True

In [49]:
_HAS_BS4

True

In [50]:
_HAS_HTML5LIB

True

In [51]:
import bs4

In [52]:
import html5lib

### 任务

寻找连续 N 天上涨的标的（N 从 3 起）；“上涨”定义为当日收盘价高于开盘价。