In [None]:
import gzip
import os
import pandas as pd
from io import StringIO
import numpy as np
import datetime
import dateutil.parser as parser

In [45]:
def read_stock_data(data_home, data_type, venue, year, month, day, ticker, is_filter):
    '''
    从sever中读取一只股票一天的数据
    data_home:数据所在folder
    data_type:类型
    venue:交易所
    is_filter:是否进行filter操作
    '''

    path = str(data_home) + '/' + str(data_type) + '/' + str(venue) + '/' + str(year) + '/' + str(
        month) + '/' + str(day) + '/' + str(ticker) + '.' + str(venue)  # 读数据的路径

    if (os.path.exists(path)):
        data0 = open(path, 'r')
        data1 = pd.read_csv(StringIO(data0.read()))
        data = data1.loc[:, ['time', 'volume', 'current','a1_v','a1_p','b1_v','b1_p',]]
        diff_df = data.loc[:, ['time', 'volume']].diff()  # 差分，求出每次交易的交易量、交易额
        data.iloc[1:len(data['volume']), 1] = diff_df.iloc[1:len(data['volume']), 1] # 第0个为NaN,从第一个代替原数据的volume
        data['current'] = data['current']/10000
        data['spread'] = (data['a1_p'] - data['b1_p'])/(data['a1_p'] + data['b1_p'])  # 计算 spread
        data['quote_imbalance'] = (data['b1_v'] - data['a1_v']) / (data['b1_v'] + data['a1_v'])
        if is_filter == 0:  # 不filter数据的时候
            return data
        else:
            quantile = np.percentile(data['volume'], 99.5)
            quantile2 = np.percentile(data['volume'], 0.5) 
            data = data[(quantile2 <= data['volume']) & (data['volume'] <= quantile)]  
            return data
    else:
        path = str(data_home) + '/' + str(data_type) + '/' + str(venue) + '/' + str(year) + '/' + str(
            month) + '/' + str(day) + '/' + str(ticker) + '.' + str(venue) + '.gz'
        with gzip.open(path, 'rb') as gf:
            data1 = pd.read_csv(gf)
        data = data1.loc[:, ['time', 'volume','current', 'a1_v','a1_p','b1_v','b1_p',]]
        diff_df = data.loc[:, ['time', 'volume']].diff()  # 差分，求出每次交易的交易量、交易额
        data.iloc[1:len(data['volume']), 1] = diff_df.iloc[1:len(data['volume']), 1] # 第0个为NaN,从第一个代替原数据的volume
        data['current'] = data['current']/10000
        data['quote_imbalance'] = (data['b1_v'] - data['a1_v']) / (data['b1_v'] + data['a1_v'])
        if is_filter == 0:  # 不filter数据的时候
            return data
        else:
            quantile = np.percentile(data['volume'], 99.5)
            quantile2 = np.percentile(data['volume'], 0.5) 
            data = data[(quantile2 <= data['volume']) & (data['volume'] <= quantile)]  
            return data
def trans_date(date):  # transform date to year-month-day
    dates = []
    for i in range(len(date)):
        year = str(date[i])[0:4]
        month = str(date[i])[4:6]
        day = str(date[i])[6:8]
        date_std = datetime.date(int(year), int(month), int(day)).isoformat()
        dates.append(date_std)
    return dates


def trans_time(time):  # transform time to hour:minute:second
    times = []
    for i in range(len(time)):
        hour = str(time[i])[8:10]
        minute = str(time[i])[10:12]
        second = str(time[i])[12:14]
        time_std = datetime.time(int(hour), int(minute), int(second)).isoformat()
        times.append(time_std)
    return times

def divide_bin(time, binnumber):  # 计算每条交易所属的bin number
    '''
    time:columns of time
    '''
    n = 237 / (binnumber - 1) * 60
    bin_nums = []
    for i in range(len(time)):
        if datetime.datetime.strptime(time[i],"%H:%M:%S") < datetime.datetime.strptime("09:30:00","%H:%M:%S"):
            bin_num = 0  # 交易发生在9：30之前，bin number为0
        elif datetime.datetime.strptime(time[i],"%H:%M:%S")>datetime.datetime.strptime("15:00:00","%H:%M:%S"):
            bin_num = binnumber+1
        else:
            starttime = parser.parse(datetime.time(9, 30, 0).isoformat())  # 开始时间设为9：30
            endtime = parser.parse(time[i])  # 结束时间是该条数据的交易时间
            s = (endtime - starttime).seconds  # 从开盘到现在的秒数
            if s > -1 and s < 7201:  # 交易发生在9：30-11：30之前
                bin_num = int((s - 0.5) // n) + 1  # 9：30之后的bin number从1开始
            elif s > 12599 and s < 19801:  # 13:00-15:00
                bin_num = int((s - 0.5 - 5400) // n)+1  # 去掉中间的90分钟
            else:
                bin_num = binnumber
        bin_nums.append(bin_num)
    return bin_nums

####### volatility imbalance
def cal_bin_volume(subdf, binnumber):
    '''
    subdf: data to be processed, DataFrame
    return: DataFrame including one stock, ranked by bin number
    '''

    subdf = subdf[~subdf['bin_num'].isin([binnumber + 1])]  # Exclude rows with bin number binnumber+1
    subdf = subdf[~subdf['bin_num'].isin([binnumber])]  # Exclude rows with bin number binnumber

    daily_volume = subdf['volume'].groupby(subdf['date']).sum().reset_index()  # Calculate total volume for each day
    bin_volume = subdf['volume'].groupby([subdf['date'], subdf['bin_num']]).sum().reset_index()  # Calculate volume for each bin
    
    # 计算标准差
    
    volatility = subdf['current'].groupby([subdf['date'], subdf['bin_num']]).std().reset_index().rename(columns={'current': 'volatility'})
    df = pd.merge(daily_volume, bin_volume, how='outer', on='date')  # Merge daily_volume and bin_volume
    subdf1 = pd.merge(df, volatility, how='outer', on=['date', 'bin_num'])

    def exponential_weighted_average(numbers, alpha):
        n = len(numbers)
        weights = np.array([alpha ** (n - 1 - i) for i in range(n)])  # 使用倒序的权重计算
        weighted_sum = np.sum(np.multiply(numbers, weights))
        weight_sum = np.sum(weights)
        ewma = weighted_sum / weight_sum
        return ewma

    
    imbalance = subdf['quote_imbalance'].groupby([subdf['date'], subdf['bin_num']]).apply(lambda x: exponential_weighted_average(x, 0.9)).reset_index()
    subdf1 = pd.merge(subdf1,imbalance,how='outer',on=['date','bin_num'])
    return subdf1


def get_df(data_home, data_type, venue, year, month, day,ticker, bin_num, is_filter=0):
    stock_data = read_stock_data(data_home, data_type, venue, year, month, day, ticker, is_filter=0)
    stock_data = stock_data.reset_index(drop=True)
    transdate = trans_date(stock_data['time'])
    transtime = trans_time(stock_data['time'])
    stock_data.loc[:, 'date'] = transdate  # replace the original data by transformed data
    stock_data.loc[:, 'timet'] = transtime
    bin_nums = divide_bin(time=stock_data['timet'], binnumber=bin_num)
    stock_data.loc[:, 'bin_num'] = bin_nums
    vol_df = cal_bin_volume(subdf=stock_data, binnumber=bin_num)
    vol_df = vol_df.rename(columns={'volume_x': 'daily_volume'})
    vol_df = vol_df.rename(columns={'volume_y': 'bin_volume'})
    vol_df['bin_volume'] = vol_df['bin_volume'].fillna(1)  # 空值用1填充
    vol_df['daily_volume'] = vol_df['daily_volume'].fillna(method = 'bfill')  # 空值用向上填充
    return vol_df

In [46]:
def read_stock_data_all(data_home, data_type, venue, start_date, end_date, ticker, bin_number, is_filter):
    '''
    读取一只股票所有日期的数据，from start_date to end_date
    '''

    data_concat = pd.DataFrame(columns=['date', 'daily_volume', 'bin_num', 'bin_volume'])

    start_date1 = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end_date1 = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    interval_day = (end_date1-start_date1).days

    ##遍历日期

    for i in range(interval_day+1):
        date = datetime.datetime.strptime(start_date,'%Y-%m-%d') + datetime.timedelta(days=i)

        date2 = datetime.datetime.strftime(date, '%Y-%m-%d')
        year = date2[0:4]
        month = date2[5:7]
        day = date2[8:10]

        if len(str(month)) < 2:
            month = str(0) + str(month)
        else:
            month = str(month)
        if len(str(day)) < 2:
            day = str(0) + str(day)
        else:
            day = str(day)

        dirs = str(data_home) + '/' + str(data_type) + '/' + str(venue) + '/' + str(year) + '/' + str(
                    month) + '/' + str(day) + '/'


        if not (os.path.exists(dirs)):
            continue
        else:
            print(year, month, day)
            data = get_df(data_home, data_type, venue, year, month, day, ticker, bin_number, is_filter)
            frames = [data_concat, data]
            data_concat = pd.concat(frames)  # 将一只股票多天的数据合并到一个数据框里

    return data_concat.reset_index(drop=True)  # 返回合并后的数据框并重新设置下标


In [18]:
data_home = '/volume1/sinoalgo/data/sinoalgo/JQMarketData'
data_types = [ 'STOCK', 'STOCK']
venues = [
          'XSHE', 'XSHE']
tickers = [ 
           '002988']
start_date = "2020-03-18"
end_date = "2021-1-10"
bin_num = 25
def data_generating_all(data_home, data_types, venues, tickers,start_date,end_date,bin_num, is_filter=0):
    data_home = data_home
    for i in range(len(data_types)):
        data_type = data_types[i]
        venue = venues[i]
        ticker = tickers[i]
        result_df = read_stock_data_all(data_home, data_type, venue, start_date, end_date, ticker, bin_num, is_filter=0)
        filename_basic = '/volume1/home/rzhu/新股票/数据/' + str(ticker) + '_' + str(venue) + '_' + str(
            bin_num) +'_'+'daily.csv'
        result_df.to_csv(filename_basic, index=False)

data_generating_all(data_home,data_types,venues,tickers,start_date,end_date,bin_num,is_filter=0)

2020 03 18


FileNotFoundError: [Errno 2] No such file or directory: '/volume1/sinoalgo/data/sinoalgo/JQMarketData/STOCK/XSHE/2020/03/18/002988.XSHE.gz'

In [19]:
# 给定的股票列表
stocks = [
    '000725.XSHE', '300059.XSHE', '300750.XSHE', '000049.XSHE', '300016.XSHE',
    '000032.XSHE', '002046.XSHE', '003017.XSHE', '300023.XSHE', '000005.XSHE',
    '000725.XSHE', '300059.XSHE', '300750.XSHE', '000049.XSHE', '000026.XSHE',
    '002075.XSHE', '300016.XSHE', '000032.XSHE', '002046.XSHE', '002651.XSHE',
    '000004.XSHE', '300180.XSHE', '003017.XSHE', '300023.XSHE', '000005.XSHE',
    '000407.XSHE', '000020.XSHE', '300649.XSHE', '300750.XSHE', '000049.XSHE',
    '000032.XSHE', '003017.XSHE', '002801.XSHE', '000504.XSHE', '000725.XSHE',
    '002046.XSHE', '000005.XSHE', '300116.XSHE', '000036.XSHE', '000958.XSHE',
    '300059.XSHE', '300016.XSHE', '300023.XSHE', '300056.XSHE', '000603.XSHE',
    '300162.XSHE'
]

# 整理成指定形式的股票列表，并去除重复
formatted_stocks = list(set([stock.split('.')[0] for stock in stocks]))

# 打印整理后的股票列表
print(formatted_stocks)


['300059', '000603', '300750', '000026', '000407', '300023', '000725', '300116', '000958', '003017', '300180', '300162', '002046', '000004', '000036', '000032', '002075', '002801', '002651', '000504', '000005', '000049', '300056', '300016', '000020', '300649']


In [20]:
formatted_stocks.sort()

In [23]:
formatted_stocks

['000004',
 '000005',
 '000020',
 '000026',
 '000032',
 '000036',
 '000049',
 '000407',
 '000504',
 '000603',
 '000725',
 '000958',
 '002046',
 '002075',
 '002651',
 '002801',
 '003017',
 '300016',
 '300023',
 '300056',
 '300059',
 '300116',
 '300162',
 '300180',
 '300649',
 '300750']

In [24]:
['000004',
 '000005',
 '000009',
 '000026',
 '000032',
 '000036',
 '000049',
 '000407',
 '000504',
 '000603',
 '000617',
 '000725',
 '000958',
 '002046',
 '002651',
 '002781',
 '002801',
 '300016',
 '300023',
 '300056',
 '300059',
 '300116',
 '300162',
 '300180',
 '300810',
 '300750']

['000004',
 '000005',
 '000009',
 '000026',
 '000032',
 '000036',
 '000049',
 '000407',
 '000504',
 '000603',
 '000617',
 '000725',
 '000958',
 '002046',
 '002651',
 '002781',
 '002801',
 '300016',
 '300023',
 '300056',
 '300059',
 '300116',
 '300162',
 '300180',
 '300810',
 '300750']

In [7]:
input_tickers = ['000004',
 '000005',
 '000009',
 '000026',
 '000032',
 '000036',
 '000049',
 '000407',
 '000603',
 '000617',
 '000725',
 '000958',
 '002046',
 '002651',
 '002780',
 '002801',
 '300016',
 '300023',
 '300059',
 '300116',
 '300162',
 '300180',
 '300810',
 '300750']
input_path='/volume1/home/rzhu/新股票/数据/处理后的/'
for ticker in input_tickers:
    df = pd.read_csv(input_path + ticker + '_' + 'XSHE' + '_25_daily.csv')
    dates_to_remove = df[df['quote_imbalance'].isin([1, -1])]['date'].unique()
    df = df[~df['date'].isin(dates_to_remove)]
    df.to_csv('/volume1/home/rzhu/新股票/数据/去除涨跌停/' + ticker + '_' + 'XSHE' + '_25_daily.csv', index=False)