# Read Raw

In [21]:
import pandas as pd

# Load the raw, gzip-compressed trades CSV (edit the path to point at another file)
df = pd.read_csv('/home/ubuntu/Rheza/local-share/06_trades_and_orderbooks/BTCUSDT2025-01-11.csv.gz', compression='gzip')

# Scale 'timestamp' by 1000 and store it as an integer
# (presumably fractional epoch seconds -> epoch milliseconds; TODO confirm against the data source).
# Round before casting: the float product can land a hair below the intended whole
# number, and a bare integer cast truncates, which would shift that row by one unit.
# 'int64' is spelled out because plain `int` is platform-dependent and 13-digit
# values do not fit in 32 bits.
df['timestamp'] = (df['timestamp'] * 1000).round().astype('int64')

# Keep only the columns used by the rest of the notebook
df = df[['timestamp','side','size','price', 'tickDirection']]

# Display the dataframe
df

Unnamed: 0,timestamp,side,size,price,tickDirection
0,1736553600326,Buy,0.001,94684.3,PlusTick
1,1736553600781,Sell,0.082,94684.2,MinusTick
2,1736553600831,Sell,0.001,94684.2,ZeroMinusTick
3,1736553601018,Sell,0.001,94684.2,ZeroMinusTick
4,1736553601022,Sell,0.002,94684.2,ZeroMinusTick
...,...,...,...,...,...
677407,1736639994670,Sell,0.100,94569.4,MinusTick
677408,1736639997157,Sell,0.010,94569.4,ZeroMinusTick
677409,1736639998097,Sell,0.001,94569.4,ZeroMinusTick
677410,1736639998421,Sell,0.001,94569.4,ZeroMinusTick


In [23]:
# Export of the trimmed trades frame, currently disabled.
# NOTE(review): the next cell reads a file with this same name from an absolute
# directory; presumably it was produced by running this line once and moving the
# result there — confirm, since a fresh run will not recreate it.
# df.to_csv('cleaned_trades_BTCUSDT2025-01-11.csv', index=False)

# Read Simplified Data

In [24]:
import pandas as pd

# Location of the simplified trades file (plain CSV, not gzip-compressed)
cleaned_trades_path = '/home/ubuntu/Rheza/local-share/06_trades_and_orderbooks/cleaned_trades_BTCUSDT2025-01-11.csv'

# Load it into `df`, replacing whatever frame was bound to that name before
df = pd.read_csv(cleaned_trades_path)

# Show the loaded frame
df

Unnamed: 0,timestamp,side,size,price,tickDirection
0,1736553600326,Buy,0.001,94684.3,PlusTick
1,1736553600781,Sell,0.082,94684.2,MinusTick
2,1736553600831,Sell,0.001,94684.2,ZeroMinusTick
3,1736553601018,Sell,0.001,94684.2,ZeroMinusTick
4,1736553601022,Sell,0.002,94684.2,ZeroMinusTick
...,...,...,...,...,...
677407,1736639994670,Sell,0.100,94569.4,MinusTick
677408,1736639997157,Sell,0.010,94569.4,ZeroMinusTick
677409,1736639998097,Sell,0.001,94569.4,ZeroMinusTick
677410,1736639998421,Sell,0.001,94569.4,ZeroMinusTick


# Data Cleaning

In [25]:
# Re-display the loaded trades frame before running the validity checks below
df

Unnamed: 0,timestamp,side,size,price,tickDirection
0,1736553600326,Buy,0.001,94684.3,PlusTick
1,1736553600781,Sell,0.082,94684.2,MinusTick
2,1736553600831,Sell,0.001,94684.2,ZeroMinusTick
3,1736553601018,Sell,0.001,94684.2,ZeroMinusTick
4,1736553601022,Sell,0.002,94684.2,ZeroMinusTick
...,...,...,...,...,...
677407,1736639994670,Sell,0.100,94569.4,MinusTick
677408,1736639997157,Sell,0.010,94569.4,ZeroMinusTick
677409,1736639998097,Sell,0.001,94569.4,ZeroMinusTick
677410,1736639998421,Sell,0.001,94569.4,ZeroMinusTick


In [26]:
import numpy as np

# Columns that are checked for infinite and zero entries
numeric_cols = ['price', 'size']

# Missing values, counted per column over the whole frame
nan_counts = df.isna().sum()

# Infinite values and exact zeros, counted per column in the numeric columns only
inf_counts = np.isinf(df[numeric_cols]).sum()
zero_counts = (df[numeric_cols] == 0).sum()

# Print the three reports, each under its own label
for label, counts in (
    ("NaN counts:\n", nan_counts),
    ("\nInf counts:\n", inf_counts),
    ("\nZero counts:\n", zero_counts),
):
    print(label, counts)

NaN counts:
 timestamp        0
side             0
size             0
price            0
tickDirection    0
dtype: int64

Inf counts:
 price    0
size     0
dtype: int64

Zero counts:
 price    0
size     0
dtype: int64


# Feature Engineering

In [30]:
# Work on a copy so the loaded `df` stays untouched
df_features = df.copy()

# Per-trade size multiplied by price, stored as the new 'quoteprice' column
size_times_price = df_features['size'] * df_features['price']
df_features['quoteprice'] = size_times_price

# Show the frame with the added column
df_features

Unnamed: 0,timestamp,side,size,price,tickDirection,quoteprice
0,1736553600326,Buy,0.001,94684.3,PlusTick,94.6843
1,1736553600781,Sell,0.082,94684.2,MinusTick,7764.1044
2,1736553600831,Sell,0.001,94684.2,ZeroMinusTick,94.6842
3,1736553601018,Sell,0.001,94684.2,ZeroMinusTick,94.6842
4,1736553601022,Sell,0.002,94684.2,ZeroMinusTick,189.3684
...,...,...,...,...,...,...
677407,1736639994670,Sell,0.100,94569.4,MinusTick,9456.9400
677408,1736639997157,Sell,0.010,94569.4,ZeroMinusTick,945.6940
677409,1736639998097,Sell,0.001,94569.4,ZeroMinusTick,94.5694
677410,1736639998421,Sell,0.001,94569.4,ZeroMinusTick,94.5694


In [5]:
# Report the smallest and largest value of the 'timestamp' column.
# The column key uses double quotes: reusing the f-string's own quote character
# inside a replacement field only parses on Python 3.12+.
print(f'Min Timestamp: {df["timestamp"].min()}')
print(f'Max Timestamp: {df["timestamp"].max()}')

Min Timestamp: 1738022400064
Max Timestamp: 1738108799461


In [6]:
import pandas as pd
import numpy as np

# Width of every aggregation window, in milliseconds
WINDOW_MS = 5000

# Sort chronologically. A stable sort keeps trades that share a timestamp in their
# existing order, so the tie-breaking below does not depend on the sort algorithm.
df_features = df_features.sort_values(by="timestamp", kind="stable")
ts_sorted = df_features["timestamp"].to_numpy()

# Window end timestamps (ms), aligned to multiples of WINDOW_MS and derived from the
# data itself, so the bins always cover whichever file is loaded. For trades spanning
# 1738022400064..1738108799461 this is exactly
# range(1738022405000, 1738108805000, 5000).
if len(ts_sorted) > 0:
    first_target_ts = (int(ts_sorted[0]) // WINDOW_MS + 1) * WINDOW_MS
    last_target_ts = (int(ts_sorted[-1]) // WINDOW_MS + 1) * WINDOW_MS
    target_ts_list = list(range(first_target_ts, last_target_ts + WINDOW_MS, WINDOW_MS))
else:
    # No trades at all: produce an empty summary instead of failing on min/max
    target_ts_list = []


def _side_stats(side_trades):
    """Summarise the trades of one side (Buy or Sell) inside one window.

    Returns a tuple ``(sum of quoteprice, sum of size, price with the largest
    summed size)``. The sums of an empty selection are 0 and the price is NaN.
    """
    quoteprice_sum = side_trades["quoteprice"].sum()
    size_sum = side_trades["size"].sum()
    if side_trades.empty:
        price_most_size = np.nan
    else:
        # Ties go to the lowest price: groupby sorts its keys and idxmax
        # returns the first maximum.
        price_most_size = side_trades.groupby("price")["size"].sum().idxmax()
    return quoteprice_sum, size_sum, price_most_size


# One row of aggregates per window
results = []

for target_ts in target_ts_list:
    # Window is [target_ts - WINDOW_MS, target_ts). The timestamps are sorted, so its
    # rows form one contiguous slice found by binary search rather than by a boolean
    # mask over the whole frame on every iteration.
    lo = np.searchsorted(ts_sorted, target_ts - WINDOW_MS, side="left")
    hi = np.searchsorted(ts_sorted, target_ts, side="left")
    df_window = df_features.iloc[lo:hi]

    # Quote-price sum, size sum and most-traded price, per side
    sum_buy_quoteprice, sum_buy_size, buy_price_most_size = _side_stats(
        df_window[df_window["side"] == "Buy"]
    )
    sum_sell_quoteprice, sum_sell_size, sell_price_most_size = _side_stats(
        df_window[df_window["side"] == "Sell"]
    )

    # Nearest price: price of the row whose timestamp is closest to target_ts
    # (first such row when several share that timestamp); NaN for an empty window
    if df_window.empty:
        nearest_price = np.nan
    else:
        nearest_idx = (df_window["timestamp"] - target_ts).abs().idxmin()
        nearest_price = df_window.loc[nearest_idx, "price"]

    results.append([
        target_ts, sum_buy_quoteprice, sum_sell_quoteprice, sum_buy_size, sum_sell_size,
        nearest_price, buy_price_most_size, sell_price_most_size
    ])

# Create DataFrame from results
df_summary = pd.DataFrame(results, columns=[
    "timestamp", "sum_buy_quoteprice", "sum_sell_quoteprice", "sum_buy_size", "sum_sell_size",
    "nearest_price", "buy_price_most_size", "sell_price_most_size"
])

# Display summary
df_summary

Unnamed: 0,timestamp,sum_buy_quoteprice,sum_sell_quoteprice,sum_buy_size,sum_sell_size,nearest_price,buy_price_most_size,sell_price_most_size
0,1738022405000,10816.0120,89359.0653,3.40,28.09,3181.18,3181.18,3181.17
1,1738022410000,207855.1386,149413.4592,65.34,46.97,3180.59,3181.18,3181.17
2,1738022415000,85201.5679,44175.9577,26.79,13.89,3180.73,3180.27,3180.37
3,1738022420000,67215.3038,53759.6228,21.13,16.90,3180.84,3181.18,3181.10
4,1738022425000,83908.5168,11164.6134,26.38,3.51,3181.17,3180.79,3180.84
...,...,...,...,...,...,...,...,...
17275,1738108780000,5507.1191,92.2959,1.79,0.03,3076.93,3076.54,3076.53
17276,1738108785000,4615.6650,5354.1540,1.50,1.74,3077.10,3077.11,3077.10
17277,1738108790000,2984.0676,104453.1112,0.97,33.95,3075.89,3076.62,3077.10
17278,1738108795000,8488.3328,6059.1430,2.76,1.97,3076.19,3075.38,3075.75


In [7]:
# Write the per-window summary to a CSV in the working directory, without the index column
summary_csv_path = 'cleaned_trades_20250128.csv'
df_summary.to_csv(summary_csv_path, index=False)