In [1]:
import polars as pl

# Data preprocessing

- Binarize Operation type

In [2]:
# Load the raw MOEX SBER CSV export; the "ID" column is discarded right after reading.
raw_csv_path = "../data/MOEX_SBER_20241123_20241130.csv"
market_data: pl.DataFrame = pl.read_csv(raw_csv_path).drop("ID")

# Binarize Operation type: turn the "OPER" code into one boolean flag per side
# ("S" -> SELL, "B" -> BUY), then drop the original column.
oper_code = pl.col("OPER")
market_data = market_data.with_columns(
    SELL=(oper_code == "S"),
    BUY=(oper_code == "B"),
).drop(["OPER"])

market_data.head(3)

DATE,TIME,LAST,VOL,SELL,BUY
i64,str,f64,i64,bool,bool
20241125,"""09:59:43""",236.0,80,False,True
20241125,"""09:59:43""",236.0,120,False,True
20241125,"""09:59:43""",236.0,20,False,True


In [3]:
# Combine DATE and TIME into a single DATETIME column.
# DATE is read as an integer in YYYYMMDD form, so it is cast to a string before parsing.
parsed_date = pl.col("DATE").cast(pl.String).str.to_date(format="%Y%m%d")
# TIME is read as an "HH:MM:SS" string.
parsed_time = pl.col("TIME").str.strptime(pl.Time, format="%H:%M:%S")

market_data = market_data.with_columns(
    # The keyword name is the output column name, so no extra alias is needed.
    DATETIME=parsed_date.dt.combine(parsed_time),
).drop(["TIME", "DATE"])

In [4]:
# Preview the first rows after DATE and TIME were replaced by the DATETIME column.
market_data.head(3)

LAST,VOL,SELL,BUY,DATETIME
f64,i64,bool,bool,datetime[μs]
236.0,80,False,True,2024-11-25 09:59:43
236.0,120,False,True,2024-11-25 09:59:43
236.0,20,False,True,2024-11-25 09:59:43


- Compute Buy and Sell volumes per timestamp
- Compute Mid Price for each timestamp as the mean of best bid and best ask (when only one side is present, that side's price is used)

In [5]:
# Aggregate all records that share a timestamp into one row.
is_sell = pl.col("SELL")
is_buy = pl.col("BUY")
price = pl.col("LAST")
volume = pl.col("VOL")

market_data = market_data.group_by("DATETIME", maintain_order=True).agg(
    # Multiplying by the boolean flag zeroes out the other side's volume, so a
    # timestamp with no records on one side gets 0 for that side.
    (volume * is_sell).sum().alias("SELL_VOLUME"),
    (volume * is_buy).sum().alias("BUY_VOLUME"),
    # Highest SELL price / lowest BUY price within the timestamp; the result is
    # null when the timestamp has no record on that side.
    price.filter(is_sell).max().alias("BEST_BID"),
    price.filter(is_buy).min().alias("BEST_ASK"),
)

In [6]:
# Mid price: the mean of best ask and best bid when both exist, otherwise
# whichever side is available.
# A missing side is null (not NaN), so presence is tested with is_not_null().
best_ask = pl.col("BEST_ASK")
best_bid = pl.col("BEST_BID")

market_data = market_data.with_columns(
    pl.when(best_ask.is_not_null() & best_bid.is_not_null())
    .then((best_ask + best_bid) / 2)
    .otherwise(pl.coalesce(best_ask, best_bid))
    .alias("MID_PX")
)

In [7]:
# Display the aggregated per-timestamp frame, including the MID_PX column.
market_data

DATETIME,SELL_VOLUME,BUY_VOLUME,BEST_BID,BEST_ASK,MID_PX
datetime[μs],i64,i64,f64,f64,f64
2024-11-25 09:59:43,68290,43250,236.0,236.0,236.0
2024-11-25 10:00:00,4850,2190,235.99,236.0,235.995
2024-11-25 10:00:01,120,2860,235.98,236.0,235.99
2024-11-25 10:00:02,0,470,,236.0,236.0
2024-11-25 10:00:03,1600,110,235.99,236.0,235.995
…,…,…,…,…,…
2024-11-29 23:49:49,500,1400,236.46,236.49,236.475
2024-11-29 23:49:52,10000,0,236.43,,236.43
2024-11-29 23:49:53,10,0,236.44,,236.44
2024-11-29 23:49:56,0,510,,236.49,236.49


In [8]:
# Persist the aggregated frame to disk as CSV.
market_data.write_csv("../data/train.csv")

In [9]:
## Interpolate missing values (disabled; kept for reference)
# # Get all timestamps between the min and max time at 1-second intervals
# time_range = pl.DataFrame({
#     "DATETIME": pl.datetime_range(
#         start=market_data["DATETIME"].min(), end=market_data["DATETIME"].max(), interval="1s", eager=True
#     )
# })

# market_data = time_range.join(market_data, on="DATETIME", how="left")

# market_data = market_data.with_columns([
#     pl.col("SELL_VOLUME").fill_null(0),
#     pl.col("BUY_VOLUME").fill_null(0),
#     pl.col("MID_PX").forward_fill()
# ])