In [1]:
import os
import gzip
import pandas as pd
from io import StringIO
import numpy as np
import datetime
import dateutil.parser as parser
def read_stock_data(data_home, data_type, venue, year, month, day, ticker, is_filter):
    '''
    Read one day of tick data for a single stock from the server.

    The file is looked up at
    <data_home>/<data_type>/<venue>/<year>/<month>/<day>/<ticker>.<venue>;
    when that plain file does not exist, the gzip-compressed sibling with an
    extra '.gz' suffix is read instead.

    data_home: root folder that holds the data
    data_type: data type (directory level directly under data_home)
    venue: exchange code; used as a directory level and as the file suffix
    year, month, day: date components of the path (each passed through str())
    ticker: stock code, used as the file stem
    is_filter: 0 returns every row; any other value drops the rows whose
        per-row volume is above the 99.9th percentile of that column

    Returns a DataFrame with the columns time, volume, current, a1_v, a1_p,
    b1_v, b1_p and spread:
      * volume  - from the second row on, the change in the source 'volume'
                  column relative to the previous row; the first row keeps its
                  source value (presumably the source column is cumulative -
                  TODO confirm against the data provider)
      * current - source value divided by 10000
      * spread  - (a1_p - b1_p) / (a1_p + b1_p)

    Raises FileNotFoundError (from gzip.open) when neither file exists.
    '''
    columns = ['time', 'volume', 'current', 'a1_v', 'a1_p', 'b1_v', 'b1_p']

    folder = '/'.join(str(part) for part in (data_home, data_type, venue, year, month, day))
    path = folder + '/' + str(ticker) + '.' + str(venue)

    # Both readers run inside `with` so the file handle is always released.
    if os.path.exists(path):
        with open(path, 'r') as fh:
            raw = pd.read_csv(fh)
    else:
        with gzip.open(path + '.gz', 'rb') as gf:
            raw = pd.read_csv(gf)

    # Work on an explicit copy so the column assignments below never write
    # into (or warn about) a slice of the raw frame.
    data = raw.loc[:, columns].copy()

    # Row-to-row differences of 'volume', computed in the column's own dtype;
    # row 0 has no predecessor and keeps its source value.
    source_volume = data['volume'].to_numpy()
    per_row_volume = source_volume.copy()
    per_row_volume[1:] = np.diff(source_volume)
    data['volume'] = per_row_volume

    data['current'] = data['current'] / 10000
    data['spread'] = (data['a1_p'] - data['b1_p']) / (data['a1_p'] + data['b1_p'])

    # Nothing to filter when filtering is off or there are no rows at all
    # (np.percentile cannot be evaluated on an empty column).
    if is_filter == 0 or data.empty:
        return data

    cutoff = np.percentile(data['volume'], 99.9)  # 99.9th percentile of per-row volume
    return data[data['volume'] <= cutoff]         # drop rows above the cutoff

In [39]:
# Arguments passed to read_stock_data when loading the frame below.
# Root folder of the market-data tree.
data_home = '/volume1/sinoalgo/data/sinoalgo/JQMarketData'
# Data-type directory level under data_home.
data_types = 'STOCK'
# Venue code: used both as a directory level and as the file suffix.
venues = 'XSHE'

In [42]:
# Load one day of data for a single ticker, without the percentile filter (is_filter=0).
df = read_stock_data(
    data_home=data_home,
    data_type=data_types,
    venue=venues,
    year='2020',
    month='09',
    day='01',
    ticker='000807',
    is_filter=0,
)

In [43]:
# Show the loaded frame; the rendered output below elides the middle rows.
df 

Unnamed: 0,time,volume,current,a1_v,a1_p,b1_v,b1_p,spread
0,20200901091509000,0,0.00,9300,61100,9300,61100,0.000000
1,20200901091518000,0,0.00,9500,60800,9500,60800,0.000000
2,20200901091536000,0,0.00,9500,60800,9500,60800,0.000000
3,20200901091545000,0,0.00,9500,60800,9500,60800,0.000000
4,20200901091603000,0,0.00,9500,60800,9500,60800,0.000000
...,...,...,...,...,...,...,...,...
4747,20200901145924000,0,6.13,197500,61200,197500,61200,0.000000
4748,20200901145933000,0,6.13,200600,61200,200600,61200,0.000000
4749,20200901145942000,0,6.13,355400,61300,355400,61300,0.000000
4750,20200901145951000,0,6.13,415054,61300,415054,61300,0.000000


In [70]:
# Take the 100 rows at positions 1403..1502 of the per-row volume column.
windows = df['volume'].iloc[1403:1503]

In [71]:
windows

1403      100
1404        0
1405        0
1406      100
1407      200
        ...  
1498    13100
1499     3700
1500    41100
1501    48400
1502    26500
Name: volume, Length: 100, dtype: int64

In [72]:
# Z-score the window: subtract its mean and divide by its sample standard deviation (ddof=1).
window_mean = windows.mean()
window_std = windows.std(ddof=1)
windows = (windows - window_mean) / window_std

In [73]:
windows

1403   -0.410182
1404   -0.414281
1405   -0.414281
1406   -0.410182
1407   -0.406083
          ...   
1498    0.122666
1499   -0.262624
1500    1.270339
1501    1.569554
1502    0.671910
Name: volume, Length: 100, dtype: float64

In [74]:
# Population standard deviation (ddof=0) of the window, which is what np.std computes by default.
# NOTE(review): the window was scaled with ddof=1, so on 100 z-scored points this
# evaluates to about sqrt(99/100) regardless of the data - confirm this is the intended measure.
volatility = windows.std(ddof=0)

In [75]:
volatility 

0.9949874371066201