In [1]:
import os
import gzip
import pandas as pd
from io import StringIO
import numpy as np
import datetime
import dateutil.parser as parser
def read_stock_data(data_home, data_type, venue, year, month, day, ticker, is_filter):
    '''
    从sever中读取一只股票一天的数据
    data_home:数据所在folder
    data_type:类型
    venue:交易所
    is_filter:是否进行filter操作
    '''

    path = str(data_home) + '/' + str(data_type) + '/' + str(venue) + '/' + str(year) + '/' + str(
        month) + '/' + str(day) + '/' + str(ticker) + '.' + str(venue)  # 读数据的路径

    if (os.path.exists(path)):
        data0 = open(path, 'r')
        data1 = pd.read_csv(StringIO(data0.read()))
        data = data1.loc[:, ['time', 'volume', 'current',]]
        diff_df = data.loc[:, ['time', 'volume']].diff()  # 差分，求出每次交易的交易量、交易额
        data.iloc[1:len(data['volume']), 1] = diff_df.iloc[1:len(data['volume']), 1] # 第0个为NaN,从第一个代替原数据的volume
        data['current'] = data['current']/10000
        if is_filter == 0:  # 不filter数据的时候
            return data
        else:
            quantile = np.percentile(data['volume'], 99.9)  # 计算volume99%分位数
            data = data[data['volume'] <= quantile]         # 删除大于99.9%分位数的数据
            return data
    else:
        path = str(data_home) + '/' + str(data_type) + '/' + str(venue) + '/' + str(year) + '/' + str(
            month) + '/' + str(day) + '/' + str(ticker) + '.' + str(venue) + '.gz'
        with gzip.open(path, 'rb') as gf:
            data1 = pd.read_csv(gf)
        data = data1.loc[:, ['time', 'volume', 'current',]]
        diff_df = data.loc[:, ['time', 'volume']].diff()  # 差分，求出每次交易的交易量、交易额
        data.iloc[1:len(data['volume']), 1] = diff_df.iloc[1:len(data['volume']), 1] # 第0个为NaN,从第一个代替原数据的volume
        data['current'] = data['current']/10000
        if is_filter == 0:  # 不filter数据的时候
            return data
        else:
            quantile = np.percentile(data['volume'], 99.9)  # 计算volume99%分位数
            data = data[data['volume'] <= quantile]  # 删除大于99.9%分位数的数据
            return data

In [2]:
def read_stock_data_all(data_home, data_type, venue, start_date, end_date, ticker, is_filter):
    '''
    读取一只股票所有日期的数据，from start_date to end_date
    '''

    data_concat = pd.DataFrame()

    start_date1 = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end_date1 = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    interval_day = (end_date1-start_date1).days

    ##遍历日期

    for i in range(interval_day+1):
        date = datetime.datetime.strptime(start_date,'%Y-%m-%d') + datetime.timedelta(days=i)

        date2 = datetime.datetime.strftime(date, '%Y-%m-%d')
        year = date2[0:4]
        month = date2[5:7]
        day = date2[8:10]

        if len(str(month)) < 2:
            month = str(0) + str(month)
        else:
            month = str(month)
        if len(str(day)) < 2:
            day = str(0) + str(day)
        else:
            day = str(day)

        dirs = str(data_home) + '/' + str(data_type) + '/' + str(venue) + '/' + str(year) + '/' + str(
                    month) + '/' + str(day) + '/'


        if not (os.path.exists(dirs)):
            continue
        else:
            print(year, month, day)
            data = read_stock_data(data_home, data_type, venue, year, month, day, ticker, is_filter)
            frames = [data_concat, data]
            data_concat = pd.concat(frames)  # 将一只股票多天的数据合并到一个数据框里

    return data_concat.reset_index(drop=True)  # 返回合并后的数据框并重新设置下标

In [3]:
def trans_date(date):  # transform date to year-month-day
    dates = []
    for i in range(len(date)):
        year = str(date[i])[0:4]
        month = str(date[i])[4:6]
        day = str(date[i])[6:8]
        date_std = datetime.date(int(year), int(month), int(day)).isoformat()
        dates.append(date_std)
    return dates


def trans_time(time):  # transform time to hour:minute:second
    times = []
    for i in range(len(time)):
        hour = str(time[i])[8:10]
        minute = str(time[i])[10:12]
        second = str(time[i])[12:14]
        time_std = datetime.time(int(hour), int(minute), int(second)).isoformat()
        times.append(time_std)
    return times

In [4]:
def winsorize(data_home, data_type, venue, start_date, end_date, ticker, is_filter):
    data = read_stock_data_all(data_home, data_type, venue, start_date, end_date, ticker, is_filter)
    df = pd.DataFrame(data)
    transdate = trans_date(df['time'])
    transtime = trans_time(df['time'])
    df.loc[:, 'date'] = transdate  # replace the original data by transformed data
    df.loc[:, 'timet'] = transtime
    df['timet'] = pd.to_datetime(df['timet'], format='%H:%M:%S')
    cutoff_time = datetime.datetime.strptime('14:57:00', '%H:%M:%S')
    filtered_data = df[df['timet'] <= cutoff_time]
    filtered_data.sort_values(['time', 'date', 'volume'], inplace=True)
    filtered_data.loc[:, 'quantile_999'] = filtered_data['volume'].transform(lambda x: x.rolling(window=24070).quantile(0.999))
    filtered_data.loc[:, 'winsorized_volume'] = np.where(filtered_data['volume'] >= filtered_data['quantile_999'], filtered_data['quantile_999'], filtered_data['volume'])
    return filtered_data

In [5]:
def divide_bin(time, binnumber):  # 计算每条交易所属的bin number
    '''
    time:columns of time
    '''
    n = 237 / (binnumber - 1) * 60
    bin_nums = []
    for i in range(len(time)):
        if datetime.datetime.strptime(time[i],"%H:%M:%S") < datetime.datetime.strptime("09:30:00","%H:%M:%S"):
            bin_num = 0  # 交易发生在9：30之前，bin number为0
        elif datetime.datetime.strptime(time[i],"%H:%M:%S")>datetime.datetime.strptime("15:00:00","%H:%M:%S"):
            bin_num = binnumber+1
        else:
            starttime = parser.parse(datetime.time(9, 30, 0).isoformat())  # 开始时间设为9：30
            endtime = parser.parse(time[i])  # 结束时间是该条数据的交易时间
            s = (endtime - starttime).seconds  # 从开盘到现在的秒数
            if s > -1 and s < 7201:  # 交易发生在9：30-11：30之前
                bin_num = int((s - 0.5) // n) + 1  # 9：30之后的bin number从1开始
            elif s > 12599 and s < 19801:  # 13:00-15:00
                bin_num = int((s - 0.5 - 5400) // n)+1  # 去掉中间的90分钟
            else:
                bin_num = binnumber
        bin_nums.append(bin_num)
    return bin_nums

####### volatility、quote imbalance、spread 
def cal_bin_volume(subdf, binnumber):
    '''
    subdf: data to be processed, DataFrame
    return: DataFrame including one stock, ranked by bin number
    '''

    subdf = subdf[~subdf['bin_num'].isin([binnumber + 1])]  # Exclude rows with bin number binnumber+1
    subdf = subdf[~subdf['bin_num'].isin([binnumber])]  # Exclude rows with bin number binnumber
    daily_volume = subdf['winsorized_volume'].groupby(subdf['date']).sum().reset_index()  # Calculate total volume for each day
    bin_volume = subdf['winsorized_volume'].groupby([subdf['date'], subdf['bin_num']]).sum().reset_index()  # Calculate volume for each bin
    subdf1 = pd.merge(daily_volume, bin_volume, how='outer', on='date')  # Merge daily_volume and bin_volume
    return subdf1

In [6]:
def get_df(data_home, data_type, venue,start_date, end_date,ticker, bin_num, is_filter=0):
    stock_data = winsorize(data_home, data_type, venue, start_date, end_date, ticker, is_filter)
    stock_data = stock_data.reset_index(drop=True)
    transdate = trans_date(stock_data['time'])
    transtime = trans_time(stock_data['time'])
    stock_data.loc[:, 'date'] = transdate  # replace the original data by transformed data
    stock_data.loc[:, 'timet'] = transtimedef data_generating_all(data_home, data_types, venues, tickers,start_date,end_date,bin_num, is_filter=0):
    data_home = data_home
    for i in range(len(data_types)):
        data_type = data_types[i]
        venue = venues[i]
        ticker = tickers[i]
        result_df = get_df(data_home, data_type, venue, start_date, end_date, ticker, bin_num, is_filter=0)
    bin_nums = divide_bin(time=stock_data['timet'], binnumber=bin_num)
    stock_data.loc[:, 'bin_num'] = bin_nums
    vol_df = cal_bin_volume(subdf=stock_data, binnumber=bin_num)
    vol_df = vol_df.rename(columns={'winsorized_volume_x': 'daily_volume'})
    vol_df = vol_df.rename(columns={'winsorized_volume_y': 'bin_volume'})
    vol_df['bin_volume'] = vol_df['bin_volume'].fillna(1)  # 空值用1填充
    vol_df['daily_volume'] = vol_df['daily_volume'].fillna(method = 'bfill')  # 空值用向上填充
    return vol_df

In [10]:
data_home = '/volume1/sinoalgo/data/sinoalgo/JQMarketData'
data_types = ['INDEX']
venues = ['XSHG']
tickers = ['000001']
start_date = "2020-09-01"
end_date = "2021-06-30"
bin_num = 25
def data_generating_all(data_home, data_types, venues, tickers,start_date,end_date,bin_num, is_filter=0):
    data_home = data_home
    for i in range(len(data_types)):
        data_type = data_types[i]
        venue = venues[i]
        ticker = tickers[i]
        result_df = get_df(data_home, data_type, venue, start_date, end_date, ticker, bin_num, is_filter=0)
        filename_basic = '/volume1/home/rzhu/LHH/result/' + str(ticker) + '_' + str(venue) + '_' + str(
            bin_num) +'_'+'daily.csv'
        result_df.to_csv(filename_basic, index=False)

data_generating_all(data_home,data_types,venues,tickers,start_date,end_date,bin_num,is_filter=1)

2020 09 01
2020 09 02
2020 09 03
2020 09 04
2020 09 07
2020 09 08
2020 09 09
2020 09 10
2020 09 11
2020 09 14
2020 09 15
2020 09 16
2020 09 17
2020 09 18
2020 09 21
2020 09 22
2020 09 23
2020 09 24
2020 09 25
2020 09 28
2020 09 29
2020 09 30
2020 10 09
2020 10 12
2020 10 13
2020 10 14
2020 10 15
2020 10 16
2020 10 19
2020 10 20
2020 10 21
2020 10 22
2020 10 23
2020 10 26
2020 10 27
2020 10 28
2020 10 29
2020 10 30
2020 11 02
2020 11 03
2020 11 04
2020 11 05
2020 11 06
2020 11 09
2020 11 10
2020 11 11
2020 11 12
2020 11 13
2020 11 16
2020 11 17
2020 11 18
2020 11 19
2020 11 20
2020 11 23
2020 11 24
2020 11 25
2020 11 26
2020 11 27
2020 11 30
2020 12 01
2020 12 02
2020 12 03
2020 12 04
2020 12 07
2020 12 08
2020 12 09
2020 12 10
2020 12 11
2020 12 14
2020 12 15
2020 12 16
2020 12 17
2020 12 18
2020 12 21
2020 12 22
2020 12 23
2020 12 24
2020 12 25
2020 12 28
2020 12 29
2020 12 30
2020 12 31
2021 01 04
2021 01 05
2021 01 06
2021 01 07
2021 01 08
2021 01 11
2021 01 12
2021 01 13
2021 01 14

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.sort_values(['time', 'date', 'volume'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.loc[:, 'quantile_999'] = filtered_data['volume'].transform(lambda x: x.rolling(window=24070).quantile(0.999))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.loc[:, 'winsorized_volume'] = np.where(filtered_data['vo

In [129]:
data_home = '/volume1/sinoalgo/data/sinoalgo/JQMarketData'
data_type = 'STOCK'
venue= 'XSHE'
ticker = '000725'
start_date = "2020-09-01"
end_date = "2020-10-01"
data = winsorize(data_home, data_type, venue, start_date, end_date, ticker, is_filter=0)
data

2020 09 01
2020 09 02
2020 09 03
2020 09 04
2020 09 07
2020 09 08
2020 09 09
2020 09 10
2020 09 11
2020 09 14
2020 09 15
2020 09 16
2020 09 17
2020 09 18
2020 09 21
2020 09 22
2020 09 23
2020 09 24
2020 09 25
2020 09 28
2020 09 29
2020 09 30


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.sort_values(['time', 'date', 'volume'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.loc[:, 'quantile_999'] = filtered_data['volume'].transform(lambda x: x.rolling(window=24070).quantile(0.999))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.loc[:, 'winsorized_volume'] = np.where(filtered_data['vo

Unnamed: 0,time,volume,current,a1_v,a1_p,b1_v,b1_p,spread,date,timet,quantile_999,winsorized_volume
0,20200901091500000,0,0.00,9300,55300,9300,55300,0.000000,2020-09-01,1900-01-01 09:15:00,,0.0
1,20200901091509000,0,0.00,532400,55500,532400,55500,0.000000,2020-09-01,1900-01-01 09:15:09,,0.0
2,20200901091518000,0,0.00,547300,55500,547300,55500,0.000000,2020-09-01,1900-01-01 09:15:18,,0.0
3,20200901091527000,0,0.00,547500,55500,547500,55500,0.000000,2020-09-01,1900-01-01 09:15:27,,0.0
4,20200901091536000,0,0.00,571900,55500,571900,55500,0.000000,2020-09-01,1900-01-01 09:15:36,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
106342,20200930145648000,113300,4.91,246437,49100,2258600,49000,0.001019,2020-09-30,1900-01-01 14:56:48,3615518.096,113300.0
106343,20200930145651000,158400,4.91,504337,49100,2540500,49000,0.001019,2020-09-30,1900-01-01 14:56:51,3615518.096,158400.0
106344,20200930145654000,195200,4.91,392537,49100,2537100,49000,0.001019,2020-09-30,1900-01-01 14:56:54,3615518.096,195200.0
106345,20200930145657000,65600,4.91,344137,49100,2358600,49000,0.001019,2020-09-30,1900-01-01 14:56:57,3615518.096,65600.0


In [176]:
stock_data = winsorize(data_home, data_type, venue, start_date, end_date, ticker, is_filter=0
                      )
stock_data = stock_data.reset_index(drop=True)
stock_data = stock_data.reset_index(drop=True)
transdate = trans_date(stock_data['time'])
transtime = trans_time(stock_data['time'])
stock_data.loc[:, 'date'] = transdate  # replace the original data by transformed data
stock_data.loc[:, 'timet'] = transtime

2020 09 01
2020 09 02
2020 09 03
2020 09 04
2020 09 07
2020 09 08
2020 09 09
2020 09 10
2020 09 11
2020 09 14
2020 09 15
2020 09 16
2020 09 17
2020 09 18
2020 09 21
2020 09 22
2020 09 23
2020 09 24
2020 09 25
2020 09 28
2020 09 29
2020 09 30
2020 10 09
2020 10 12
2020 10 13
2020 10 14
2020 10 15
2020 10 16
2020 10 19
2020 10 20
2020 10 21
2020 10 22
2020 10 23
2020 10 26
2020 10 27
2020 10 28
2020 10 29
2020 10 30
2020 11 02
2020 11 03
2020 11 04
2020 11 05
2020 11 06
2020 11 09
2020 11 10
2020 11 11
2020 11 12
2020 11 13
2020 11 16
2020 11 17
2020 11 18
2020 11 19
2020 11 20
2020 11 23
2020 11 24
2020 11 25
2020 11 26
2020 11 27
2020 11 30
2020 12 01
2020 12 02
2020 12 03
2020 12 04
2020 12 07
2020 12 08
2020 12 09
2020 12 10
2020 12 11
2020 12 14
2020 12 15
2020 12 16
2020 12 17
2020 12 18
2020 12 21
2020 12 22
2020 12 23
2020 12 24
2020 12 25
2020 12 28
2020 12 29
2020 12 30
2020 12 31
2021 01 04
2021 01 05
2021 01 06
2021 01 07
2021 01 08
2021 01 11
2021 01 12
2021 01 13
2021 01 14

KeyboardInterrupt: 

In [139]:
stock_data

Unnamed: 0,time,volume,current,a1_v,a1_p,b1_v,b1_p,spread,date,timet,quantile_999,winsorized_volume
0,20200901091500000,0,0.00,9300,55300,9300,55300,0.000000,2020-09-01,09:15:00,,0.0
1,20200901091509000,0,0.00,532400,55500,532400,55500,0.000000,2020-09-01,09:15:09,,0.0
2,20200901091518000,0,0.00,547300,55500,547300,55500,0.000000,2020-09-01,09:15:18,,0.0
3,20200901091527000,0,0.00,547500,55500,547500,55500,0.000000,2020-09-01,09:15:27,,0.0
4,20200901091536000,0,0.00,571900,55500,571900,55500,0.000000,2020-09-01,09:15:36,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
105920,20200930145648000,113300,4.91,246437,49100,2258600,49000,0.001019,2020-09-30,14:56:48,3615518.096,113300.0
105921,20200930145651000,158400,4.91,504337,49100,2540500,49000,0.001019,2020-09-30,14:56:51,3615518.096,158400.0
105922,20200930145654000,195200,4.91,392537,49100,2537100,49000,0.001019,2020-09-30,14:56:54,3615518.096,195200.0
105923,20200930145657000,65600,4.91,344137,49100,2358600,49000,0.001019,2020-09-30,14:56:57,3615518.096,65600.0


In [143]:
    bin_num = 25
    bin_nums = divide_bin(time=stock_data['timet'], binnumber=bin_num)
    stock_data.loc[:, 'bin_num'] = bin_nums
    vol_df = cal_bin_volume(subdf=stock_data, binnumber=bin_num)
    vol_df = vol_df.rename(columns={'volume_x': 'daily_volume'})
    vol_df = vol_df.rename(columns={'volume_y': 'bin_volume'})
    vol_df['bin_volume'] = vol_df['bin_volume'].fillna(1)  # 空值用1填充
    vol_df['daily_volume'] = vol_df['daily_volume'].fillna(method = 'bfill')  # 空值用向上填充

In [144]:
vol_df

Unnamed: 0,date,daily_volume,bin_num,bin_volume,spread,quote_imbalance
0,2020-09-01,728634685,0,12088700,0.000073,-0.052452
1,2020-09-01,728634685,1,105230987,0.000918,-0.207591
2,2020-09-01,728634685,2,52650593,0.000898,-0.165477
3,2020-09-01,728634685,3,69485067,0.000896,-0.224107
4,2020-09-01,728634685,4,43440941,0.000903,0.112095
...,...,...,...,...,...,...
545,2020-09-30,479723894,20,34269244,0.001026,-0.081190
546,2020-09-30,479723894,21,25944167,0.001021,0.283820
547,2020-09-30,479723894,22,30875837,0.001020,-0.100478
548,2020-09-30,479723894,23,29295463,0.001019,-0.082857


In [123]:
def trans_date(date):  # transform date to year-month-day
    dates = []
    for i in range(len(date)):
        year = str(date[i])[0:4]
        month = str(date[i])[4:6]
        day = str(date[i])[6:8]
        date_std = datetime.date(int(year), int(month), int(day)).isoformat()
        dates.append(date_std)
    return dates


def trans_time(time):  # transform time to hour:minute:second
    times = []
    for i in range(len(time)):
        hour = str(time[i])[8:10]
        minute = str(time[i])[10:12]
        second = str(time[i])[12:14]
        time_std = datetime.time(int(hour), int(minute), int(second)).isoformat()
        times.append(time_std)
    return times

In [124]:
df = pd.DataFrame(data)
transdate = trans_date(df['time'])
transtime = trans_time(df['time'])
df.loc[:, 'date'] = transdate  # replace the original data by transformed data
df.loc[:, 'timet'] = transtime
df

Unnamed: 0,time,volume,current,a1_v,a1_p,b1_v,b1_p,spread,date,timet
0,20200901091500000,0,0.00,9300,55300,9300,55300,0.000000,2020-09-01,09:15:00
1,20200901091509000,0,0.00,532400,55500,532400,55500,0.000000,2020-09-01,09:15:09
2,20200901091518000,0,0.00,547300,55500,547300,55500,0.000000,2020-09-01,09:15:18
3,20200901091527000,0,0.00,547500,55500,547500,55500,0.000000,2020-09-01,09:15:27
4,20200901091536000,0,0.00,571900,55500,571900,55500,0.000000,2020-09-01,09:15:36
...,...,...,...,...,...,...,...,...,...,...
106362,20200930145924000,0,4.91,6550537,49100,6550537,49100,0.000000,2020-09-30,14:59:24
106363,20200930145933000,0,4.91,6605037,49100,6605037,49100,0.000000,2020-09-30,14:59:33
106364,20200930145942000,0,4.91,6648437,49100,6648437,49100,0.000000,2020-09-30,14:59:42
106365,20200930145951000,0,4.91,7210200,49100,7210200,49100,0.000000,2020-09-30,14:59:51


In [125]:
df['timet'] = pd.to_datetime(df['timet'], format='%H:%M:%S')
cutoff_time = datetime.datetime.strptime('14:57:00', '%H:%M:%S')
filtered_data = df[df['timet'] <= cutoff_time]

In [126]:
filtered_data.sort_values(['time', 'date', 'volume'], inplace=True)
filtered_data.loc[:, 'quantile_999'] = filtered_data['volume'].transform(lambda x: x.rolling(window=24070).quantile(0.999))
filtered_data.loc[:, 'winsorized_volume'] = np.where(filtered_data['volume'] > filtered_data['quantile_999'], filtered_data['quantile_999'], filtered_data['volume'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.sort_values(['time', 'date', 'volume'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.loc[:, 'quantile_999'] = filtered_data['volume'].transform(lambda x: x.rolling(window=24070).quantile(0.999))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.loc[:, 'winsorized_volume'] = np.where(filtered_data['vo

In [168]:
filtered_data.to_csv('/volume1/home/rzhu/LHH/result/1.csv')

In [94]:
    stock_data = winsorize(data_home, data_type, venue, start_date, end_date, ticker, is_filter)
    stock_data = stock_data.reset_index(drop=True)
    transdate = trans_date(stock_data['time'])
    transtime = trans_time(stock_data['time'])
    stock_data.loc[:, 'date'] = transdate  # replace the original data by transformed data
    stock_data.loc[:, 'timet'] = transtime
    bin_nums = divide_bin(time=stock_data['timet'], binnumber=bin_num)
    stock_data.loc[:, 'bin_num'] = bin_nums
    vol_df = cal_bin_volume(subdf=stock_data, binnumber=bin_num)

In [None]:
def winsorize(data_home, data_type, venue, start_date, end_date, ticker, is_filter):
    data = read_stock_data_all(data_home, data_type, venue, start_date, end_date, ticker, is_filter)
    df = pd.DataFrame(data)
    transdate = trans_date(df['time'])
    transtime = trans_time(df['time'])
    df.loc[:, 'date'] = transdate  # replace the original data by transformed data
    df.loc[:, 'timet'] = transtime
    df['timet'] = pd.to_datetime(df['timet'], format='%H:%M:%S')
    cutoff_time = datetime.datetime.strptime('14:57:00', '%H:%M:%S')
    filtered_data = df[df['timet'] <= cutoff_time]
    filtered_data.sort_values(['time', 'date', 'volume'], inplace=True)
    filtered_data.loc[:, 'quantile_999'] = filtered_data['volume'].transform(lambda x: x.rolling(window=24070).quantile(0.999))
    filtered_data.loc[:, 'winsorized_volume'] = np.where(filtered_data['volume'] >= filtered_data['quantile_999'], filtered_data['quantile_999'], filtered_data['volume'])
    return filtered_data