In [3]:
import os
import sys
import pandas as pd
import time
import matplotlib.pyplot as plt
import plotly.express as px

from pathlib import Path
ROOT_DIR=Path(os.getcwd()).parent.parent
if not str(ROOT_DIR) in sys.path:
    sys.path.append(str(ROOT_DIR))

In [4]:
# turn off warnings
import warnings
warnings.filterwarnings("ignore")
from src.data.equity_data.tradingview import BigA
biga = BigA()

In [5]:
stocks, etfs, foreign_etf = biga.load_data_cache_from_csv(dt='2024-01-12')

# 使用 SQL 格式读写

In [6]:
import sqlite3
db_path = ROOT_DIR / 'database' / 'China_A_Share.db'
conn = sqlite3.connect(db_path)

In [7]:
# read data from database and time profling
start = time.time()
history_data = pd.read_sql("SELECT * FROM history_quote", conn)
end = time.time()
print('从本地SQL数据库读取所有数据用时: ', round(end - start,2), '秒')

从本地SQL数据库读取所有数据用时:  54.75 秒


In [8]:
print("数据大小为: ", history_data.shape)

数据大小为:  (13768777, 13)


# 使用 pickle 文件格式读写数据

In [9]:
pickle_file_path = ROOT_DIR/'data'/ 'equity_market' / '2_equity_hist_price' / 'a_share_hist_price.pickle'

start = time.time()
history_data.to_pickle(pickle_file_path)
end = time.time()
load_time_pickle = end - start

# the size of the pickle file in GB
import os
size_pickle=round(os.path.getsize(pickle_file_path)/1024/1024/1024, 2)

# load the pickle file, and profile the time
start = time.time()
history_data_from_pickle = pd.read_pickle(pickle_file_path)
end = time.time()
read_time_pickle = end - start

print(f"pickle文件大小为{size_pickle}GB")
print(f"写入pickle文件耗时{round(load_time_pickle, 2)}秒")
print(f"读取pickle文件耗时{round(read_time_pickle, 2)}秒")

pickle文件大小为1.49GB
写入pickle文件耗时10.94秒
读取pickle文件耗时4.35秒


# 使用 Parquet 文件格式读写

In [10]:
file_path_parquet=ROOT_DIR/'data'/ 'equity_market' / '2_equity_hist_price' / 'a_share_hist_price.parquet' 

# write to parquet file with time profiling
start = time.time()
history_data.to_parquet(file_path_parquet)
end = time.time()
load_time_parquet = end - start

# get the size of the file in GB
size_parquet=round(os.path.getsize(file_path_parquet)/1024/1024/1024, 2)

# read the parquet file with time profiling
start = time.time()
history_data_from_parquet = pd.read_parquet(file_path_parquet)
end = time.time()
read_time_parquet = end - start

print(f"parquet文件大小为{size_parquet}GB")
print(f"写入parquet文件耗时{round(load_time_parquet, 2)}秒")
print(f"读取parquet文件耗时{round(read_time_parquet, 2)}秒")

parquet文件大小为0.33GB
写入parquet文件耗时4.84秒
读取parquet文件耗时1.11秒


# 使用 xarray 读写 NetCDF 格式文件
xarray是一个Python库，旨在为处理多维数组数据提供强大的工具和数据结构。它特别适用于处理科学数据，例如气象数据、地理信息系统（GIS）数据、气候模拟和地球科学数据等。xarray的主要功能包括：

- 数据数组：xarray引入了DataArray数据结构，允许您轻松管理多维数组数据，并为每个维度和坐标轴提供描述性标签。
- 数据集：Dataset是xarray的另一个关键数据结构，它可以容纳多个DataArray，使您能够组织和分析复杂的数据集。
- 坐标轴标签：xarray允许您为每个维度添加可描述性的标签，这使得数据处理更加直观和易于理解。
- 灵活的数据索引：您可以使用坐标轴标签轻松访问和切片数据，而不需要手动计算索引位置。
- 内置的统计和计算功能：xarray提供了许多内置函数，用于执行各种统计和数学运算，使数据分析更加方便。

In [11]:
netcdf_file_path = ROOT_DIR / 'data' / 'equity_market' / '2_equity_hist_price' / 'a_share_hist_price.nc'

# write to netcdf file with time profiling
start = time.time()
history_data_xarray = history_data.rename(
    columns={
        '股票名称': 'stock_name',
        '股票代码': 'ticker',
        '日期': 'date',
        '开盘': 'open',
        '收盘': 'close',
        '最高': 'high',
        '最低': 'low',
        '成交量': 'volume',
        '成交额': 'volume*price',
        '振幅': 'amplitude',
        '涨跌幅': 'change_pct',
        '涨跌额': 'change',
        '换手率': 'turnover'
    }
).to_xarray()
history_data_xarray.to_netcdf(netcdf_file_path)
end = time.time()
load_time_netcdf = end - start

# get the size of the file in GB
size_netcdf=round(os.path.getsize(netcdf_file_path)/1024/1024/1024, 2)

# read the netcdf file with time profiling
start = time.time()
import xarray as xr
history_data_from_netcdf = xr.open_dataset(netcdf_file_path)
end = time.time()
read_time_netcdf = end - start

print(f"netcdf文件大小为{size_netcdf}GB")
print(f"写入netcdf文件耗时{round(load_time_netcdf, 2)}秒")
print(f"读取netcdf文件耗时{round(read_time_netcdf, 2)}秒")

netcdf文件大小为1.42GB
写入netcdf文件耗时15.4秒
读取netcdf文件耗时0.17秒
