In [15]:
import gzip
import os
import pandas as pd
from io import StringIO
import numpy as np
import datetime
import dateutil.parser as parser

In [24]:
def read_stock_data(data_home, data_type, venue, year, month, day, ticker, is_filter):
    '''
    从sever中读取一只股票一天的数据
    data_home:数据所在folder
    data_type:类型
    venue:交易所
    is_filter:是否进行filter操作
    '''

    path = str(data_home) + '/' + str(data_type) + '/' + str(venue) + '/' + str(year) + '/' + str(
        month) + '/' + str(day) + '/' + str(ticker) + '.' + str(venue)  # 读数据的路径

    if (os.path.exists(path)):
        data0 = open(path, 'r')
        data1 = pd.read_csv(StringIO(data0.read()))
        data = data1.loc[:, ['time', 'volume', 'current','a1_v','a1_p','b1_v','b1_p',]]
        diff_df = data.loc[:, ['time', 'volume']].diff()  # 差分，求出每次交易的交易量、交易额
        data.iloc[1:len(data['volume']), 1] = diff_df.iloc[1:len(data['volume']), 1] # 第0个为NaN,从第一个代替原数据的volume
        data['current'] = data['current']/10000
        data['spread'] = (data['a1_p'] - data['b1_p'])/(data['a1_p'] + data['b1_p'])
        if is_filter == 0:  # 不filter数据的时候
            return data
        else:
            quantile = np.percentile(data['volume'], 99.9)  # 计算volume99%分位数
            data = data[data['volume'] <= quantile]         # 删除大于99.9%分位数的数据
            return data
    else:
        path = str(data_home) + '/' + str(data_type) + '/' + str(venue) + '/' + str(year) + '/' + str(
            month) + '/' + str(day) + '/' + str(ticker) + '.' + str(venue) + '.gz'
        with gzip.open(path, 'rb') as gf:
            data1 = pd.read_csv(gf)
        data = data1.loc[:, ['time', 'volume','current', 'a1_v','b1_v']]
        diff_df = data.loc[:, ['time', 'volume']].diff()  # 差分，求出每次交易的交易量、交易额
        data.iloc[1:len(data['volume']), 1] = diff_df.iloc[1:len(data['volume']), 1] # 第0个为NaN,从第一个代替原数据的volume
        data['current'] = data['current']/10000
        if is_filter == 0:  # 不filter数据的时候
            return data
        else:
            quantile = np.percentile(data['volume'], 99.9)  # 计算volume99%分位数
            data = data[data['volume'] <= quantile]  # 删除大于99.9%分位数的数据
            return data

In [25]:
def read_stock_data_all(data_home, data_type, venue, start_date, end_date, ticker, bin_number, is_filter):
    '''
    读取一只股票所有日期的数据，from start_date to end_date
    '''

    data_concat = pd.DataFrame(columns=['date', 'daily_volume', 'bin_num', 'bin_volume'])

    start_date1 = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end_date1 = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    interval_day = (end_date1-start_date1).days

    ##遍历日期

    for i in range(interval_day+1):
        date = datetime.datetime.strptime(start_date,'%Y-%m-%d') + datetime.timedelta(days=i)

        date2 = datetime.datetime.strftime(date, '%Y-%m-%d')
        year = date2[0:4]
        month = date2[5:7]
        day = date2[8:10]

        if len(str(month)) < 2:
            month = str(0) + str(month)
        else:
            month = str(month)
        if len(str(day)) < 2:
            day = str(0) + str(day)
        else:
            day = str(day)

        dirs = str(data_home) + '/' + str(data_type) + '/' + str(venue) + '/' + str(year) + '/' + str(
                    month) + '/' + str(day) + '/'


        if not (os.path.exists(dirs)):
            continue
        else:
            print(year, month, day)
            data = get_df(data_home, data_type, venue, year, month, day, ticker, bin_number, is_filter)
            frames = [data_concat, data]
            data_concat = pd.concat(frames)  # 将一只股票多天的数据合并到一个数据框里

    return data_concat.reset_index(drop=True)  # 返回合并后的数据框并重新设置下标


In [26]:
def trans_date(date):  # transform date to year-month-day
    dates = []
    for i in range(len(date)):
        year = str(date[i])[0:4]
        month = str(date[i])[4:6]
        day = str(date[i])[6:8]
        date_std = datetime.date(int(year), int(month), int(day)).isoformat()
        dates.append(date_std)
    return dates


def trans_time(time):  # transform time to hour:minute:second
    times = []
    for i in range(len(time)):
        hour = str(time[i])[8:10]
        minute = str(time[i])[10:12]
        second = str(time[i])[12:14]
        time_std = datetime.time(int(hour), int(minute), int(second)).isoformat()
        times.append(time_std)
    return times

def divide_bin(time, binnumber):  # 计算每条交易所属的bin number
    '''
    time:columns of time
    '''
    n = 237 / (binnumber - 1) * 60
    bin_nums = []
    for i in range(len(time)):
        if datetime.datetime.strptime(time[i],"%H:%M:%S") < datetime.datetime.strptime("09:30:00","%H:%M:%S"):
            bin_num = 0  # 交易发生在9：30之前，bin number为0
        elif datetime.datetime.strptime(time[i],"%H:%M:%S")>datetime.datetime.strptime("15:00:00","%H:%M:%S"):
            bin_num = binnumber+1
        else:
            starttime = parser.parse(datetime.time(9, 30, 0).isoformat())  # 开始时间设为9：30
            endtime = parser.parse(time[i])  # 结束时间是该条数据的交易时间
            s = (endtime - starttime).seconds  # 从开盘到现在的秒数
            if s > -1 and s < 7201:  # 交易发生在9：30-11：30之前
                bin_num = int((s - 0.5) // n) + 1  # 9：30之后的bin number从1开始
            elif s > 12599 and s < 19801:  # 13:00-15:00
                bin_num = int((s - 0.5 - 5400) // n)+1  # 去掉中间的90分钟
            else:
                bin_num = binnumber
        bin_nums.append(bin_num)
    return bin_nums

def cal_bin_volume(subdf, binnumber):
    '''
    subdf:data to be processed,Dataframe
    return:df including one stock,ranked by bin number,Dataframe
    '''

    subdf = subdf[~subdf['bin_num'].isin([binnumber+1])]  # 通过~取反，选取不包含数字25和26的行,保证数据在交易时间段内
    subdf = subdf[~subdf['bin_num'].isin([binnumber])]

    daily_volume = subdf['volume'].groupby(subdf['date']).sum().reset_index()  # 计算每天的volume总量
    bin_volume = subdf['volume'].groupby([subdf['date'], subdf['bin_num']]).sum().reset_index() # 计算每个bin的volume
    subdf1 = pd.merge(daily_volume, bin_volume, how='outer', on='date')  # 把 daily_volume 和 bin_volume 聚合
    return subdf1




In [27]:
def get_df(data_home, data_type, venue, year, month, day,ticker, bin_num, is_filter=0):
    stock_data = read_stock_data(data_home, data_type, venue, year, month, day, ticker, is_filter=0)
    stock_data = stock_data.reset_index(drop=True)
    transdate = trans_date(stock_data['time'])
    transtime = trans_time(stock_data['time'])
    stock_data.loc[:, 'date'] = transdate  # replace the original data by transformed data
    stock_data.loc[:, 'timet'] = transtime
    bin_nums = divide_bin(time=stock_data['timet'], binnumber=bin_num)
    stock_data.loc[:, 'bin_num'] = bin_nums
    vol_df = cal_bin_volume(subdf=stock_data, binnumber=bin_num)
    vol_df = vol_df.rename(columns={'volume_x': 'daily_volume'})
    vol_df = vol_df.rename(columns={'volume_y': 'bin_volume'})
    vol_df['bin_volume'] = vol_df['bin_volume'].fillna(1)  # 空值用1填充
    vol_df['daily_volume'] = vol_df['daily_volume'].fillna(method = 'bfill')  # 空值用向上填充
    return vol_df

In [20]:
data_home = '/volume1/sinoalgo/data/sinoalgo/JQMarketData'
data_types = ['STOCK', 'STOCK', 'STOCK', 'STOCK', 'STOCK', 'STOCK', 'STOCK', 'STOCK', 'STOCK', 'STOCK', 'STOCK']
venues = ['XSHE', 'XSHE', 'XSHE', 'XSHE', 'XSHE', 'XSHE', 'XSHE', 'XSHE', 'XSHE', 'XSHE', 'XSHE']
tickers = ['000725', '300015', '300185', '000002', '000807', '002340', '000001','300750','300059','000166','000009']
start_date = "2020-09-01"
end_date = "2021-06-30"
bin_num = 25
def data_generating_all(data_home, data_types, venues, tickers,start_date,end_date,bin_num, is_filter=0):
    data_home = data_home
    for i in range(len(data_types)):
        data_type = data_types[i]
        venue = venues[i]
        ticker = tickers[i]
        result_df = read_stock_data_all(data_home, data_type, venue, start_date, end_date, ticker, bin_num, is_filter=0)
        filename_basic = '/volume1/home/rzhu/winsorize/' + str(ticker) + '_' + str(venue) + '_' + str(
            bin_num) +'_'+'daily.csv'
        result_df.to_csv(filename_basic, index=False)

data_generating_all(data_home,data_types,venues,tickers,start_date,end_date,bin_num,is_filter=0)

2020 09 01
2020 09 02
2020 09 03
2020 09 04
2020 09 07
2020 09 08
2020 09 09


KeyboardInterrupt: 

In [30]:
data_home = '/volume1/sinoalgo/data/sinoalgo/JQMarketData'
data_type = ['STOCK']
venue = ['XSHE']
ticker= ['000725']
start_date = "2020-09-01"
end_date = "2021-06-30"
bin_numnber = 25

In [31]:
read_stock_data_all(data_home, data_type, venue, start_date, end_date, ticker, 25, is_filter=0)

Unnamed: 0,date,daily_volume,bin_num,bin_volume
