In [24]:
# Attach Google Drive to the Colab filesystem so the project folder below
# is reachable under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
cd /content/drive/MyDrive/# DS Capstone

/content/drive/.shortcut-targets-by-id/1gqWXVXxyVPSoXJrYLRDV3lFM-rCcuLuy/# DS Capstone


In [26]:
cd getting_data

/content/drive/.shortcut-targets-by-id/1gqWXVXxyVPSoXJrYLRDV3lFM-rCcuLuy/# DS Capstone/getting_data


In [27]:
pip install baostock



In [29]:
# 处理数据
import numpy as np
import pandas as pd
# 获取股票数据
import baostock as bs

In [30]:
def get_data(
        code="sh.000001", start_date="2003-06-01", end_date="2023-06-01", frequency="d"
):
    """Download OHLCV bars for one security from baostock.

    Logs in to baostock, runs ``query_history_k_data_plus`` for the requested
    range, converts the all-string payload to numeric/datetime dtypes, and
    always logs out again, even when the query or the conversion fails.

    Parameters
    ----------
    code : str
        baostock security code, e.g. ``"sh.000001"``.
    start_date, end_date : str
        Query bounds, forwarded unchanged to baostock (``"YYYY-MM-DD"``).
    frequency : str
        Bar frequency forwarded to baostock (``"d"`` requests daily bars).

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (datetime), ``open``, ``close``, ``high``, ``low``
        (float64 rounded to 2 decimals) and ``volume`` (int64), in that
        order, with a fresh 0..n-1 index.

    Raises
    ------
    RuntimeError
        If the login or the query reports an error code other than ``"0"``.
    """
    #### Log in to baostock ####
    lg = bs.login()
    # Show the login response
    print("登录响应代码:" + lg.error_code)
    print("登录响应信息:" + lg.error_msg)
    if lg.error_code != "0":
        raise RuntimeError(f"baostock login failed ({lg.error_code}): {lg.error_msg}")

    try:
        rs = bs.query_history_k_data_plus(
            code,
            fields="date,code,open,high,low,close,volume",
            start_date=start_date,
            end_date=end_date,
            # Forward the caller's choice; it used to be hard-coded to "d".
            frequency=frequency,
            # "3" selects unadjusted prices in baostock's adjustflag scheme.
            adjustflag="3",
        )
        if rs.error_code != "0":
            raise RuntimeError(f"baostock query failed ({rs.error_code}): {rs.error_msg}")

        # Collect one record per iteration. `and` short-circuits, so next()
        # is no longer called once an error has been reported.
        rows = []
        while rs.error_code == "0" and rs.next():
            rows.append(rs.get_row_data())
        if rs.error_code != "0":
            # An error while paging would otherwise silently truncate the data.
            raise RuntimeError(f"baostock query failed ({rs.error_code}): {rs.error_msg}")

        result = pd.DataFrame(rows, columns=rs.fields)

        # Every field arrives as a string; convert to numeric dtypes.
        # int64 is explicit because volumes exceed the 32-bit range and a
        # plain "int" is platform-dependent.
        result = result.astype(
            {
                "open": "float64",
                "close": "float64",
                "high": "float64",
                "low": "float64",
                "volume": "int64",
            }
        ).round({"open": 2, "close": 2, "high": 2, "low": 2})
        # Convert the date column to a datetime type.
        result["date"] = pd.to_datetime(result["date"])

        # Normalise the column order and drop the redundant `code` column.
        return result[["date", "open", "close", "high", "low", "volume"]]
    finally:
        #### Log out, also on failure, so the session is never leaked ####
        bs.logout()

In [31]:
# Fetch daily bars for sh.000001 over the 20-year and the 10-year window.
# Both windows share the same security code, end date and bar frequency.
_shared_query = dict(code="sh.000001", end_date="2023-06-01", frequency="d")
data_20 = get_data(start_date="2003-06-01", **_shared_query)
data_10 = get_data(start_date="2013-06-01", **_shared_query)

login success!
登录响应代码:0
登录响应信息:success
logout success!
login success!
登录响应代码:0
登录响应信息:success
logout success!


In [32]:
# Inspect the downloaded 20-year frame (shown truncated in the output below).
data_20

Unnamed: 0,date,open,close,high,low,volume
0,2003-06-02,1577.99,1576.53,1578.87,1564.61,1166830300
1,2003-06-03,1577.08,1562.24,1582.41,1561.08,1110301000
2,2003-06-04,1560.67,1574.11,1574.77,1546.62,1427534300
3,2003-06-05,1575.31,1556.25,1575.64,1555.45,1184987300
4,2003-06-06,1553.20,1538.92,1557.76,1538.10,1052004000
...,...,...,...,...,...,...
4859,2023-05-26,3198.88,3212.50,3220.77,3179.82,26170047900
4860,2023-05-29,3219.76,3221.45,3230.97,3208.96,29738948700
4861,2023-05-30,3214.67,3224.21,3230.63,3186.63,30353246000
4862,2023-05-31,3214.66,3204.56,3216.36,3190.27,29457333800


In [33]:
# Inspect the downloaded 10-year frame (shown truncated in the output below).
data_10

Unnamed: 0,date,open,close,high,low,volume
0,2013-06-03,2300.21,2299.25,2313.43,2294.11,9688281344
1,2013-06-04,2297.10,2272.42,2297.10,2264.76,10189581800
2,2013-06-05,2270.71,2270.93,2276.86,2260.87,7648330300
3,2013-06-06,2264.43,2242.11,2266.69,2240.07,8153869400
4,2013-06-07,2242.26,2210.90,2250.63,2205.07,8997807700
...,...,...,...,...,...,...
2428,2023-05-26,3198.88,3212.50,3220.77,3179.82,26170047900
2429,2023-05-29,3219.76,3221.45,3230.97,3208.96,29738948700
2430,2023-05-30,3214.67,3224.21,3230.63,3186.63,30353246000
2431,2023-05-31,3214.66,3204.56,3216.36,3190.27,29457333800


In [34]:
# Compute the 20-year simple daily return from consecutive closing prices.
# The guard keeps this cell idempotent: without it, every re-run would
# recompute the shift, turn the current first row's return into NaN and
# drop one more row of data.
if 'return' not in data_20.columns:
    data_20 = data_20.sort_values('date')
    _prev_close_20 = data_20['close'].shift(1)
    data_20['return'] = (data_20['close'] - _prev_close_20) / _prev_close_20

    # The earliest row has no previous close, so its return is NaN; drop it.
    data_20 = data_20.dropna(subset=['return'])

In [35]:
# Inspect the 20-year frame after the `return` column has been added.
data_20

Unnamed: 0,date,open,close,high,low,volume,return
1,2003-06-03,1577.08,1562.24,1582.41,1561.08,1110301000,-0.009064
2,2003-06-04,1560.67,1574.11,1574.77,1546.62,1427534300,0.007598
3,2003-06-05,1575.31,1556.25,1575.64,1555.45,1184987300,-0.011346
4,2003-06-06,1553.20,1538.92,1557.76,1538.10,1052004000,-0.011136
5,2003-06-09,1536.27,1530.22,1548.14,1527.84,982743800,-0.005653
...,...,...,...,...,...,...,...
4859,2023-05-26,3198.88,3212.50,3220.77,3179.82,26170047900,0.003511
4860,2023-05-29,3219.76,3221.45,3230.97,3208.96,29738948700,0.002786
4861,2023-05-30,3214.67,3224.21,3230.63,3186.63,30353246000,0.000857
4862,2023-05-31,3214.66,3204.56,3216.36,3190.27,29457333800,-0.006095


In [36]:
# Compute the 10-year simple daily return from consecutive closing prices.
# The guard keeps this cell idempotent: without it, every re-run would
# recompute the shift, turn the current first row's return into NaN and
# drop one more row of data.
if 'return' not in data_10.columns:
    data_10 = data_10.sort_values('date')
    _prev_close_10 = data_10['close'].shift(1)
    data_10['return'] = (data_10['close'] - _prev_close_10) / _prev_close_10

    # The earliest row has no previous close, so its return is NaN; drop it.
    data_10 = data_10.dropna(subset=['return'])

In [37]:
# Inspect the 10-year frame after the `return` column has been added.
data_10

Unnamed: 0,date,open,close,high,low,volume,return
1,2013-06-04,2297.10,2272.42,2297.10,2264.76,10189581800,-0.011669
2,2013-06-05,2270.71,2270.93,2276.86,2260.87,7648330300,-0.000656
3,2013-06-06,2264.43,2242.11,2266.69,2240.07,8153869400,-0.012691
4,2013-06-07,2242.26,2210.90,2250.63,2205.07,8997807700,-0.013920
5,2013-06-13,2190.10,2148.36,2190.10,2126.22,9437109400,-0.028287
...,...,...,...,...,...,...,...
2428,2023-05-26,3198.88,3212.50,3220.77,3179.82,26170047900,0.003511
2429,2023-05-29,3219.76,3221.45,3230.97,3208.96,29738948700,0.002786
2430,2023-05-30,3214.67,3224.21,3230.63,3186.63,30353246000,0.000857
2431,2023-05-31,3214.66,3204.56,3216.36,3190.27,29457333800,-0.006095


In [41]:
# Persist both return tables as Excel workbooks in the current working
# directory (relative file names), one workbook per look-back window.
for _frame, _filename in (
    (data_20, 'shci_data_20y.xlsx'),
    (data_10, 'shci_data_10y.xlsx'),
):
    _frame.to_excel(_filename)