In [1]:
import pandas as pd
import json

# Path to your .data file
file_path = "/home/ubuntu/Rheza/local-share/06_trades_and_orderbooks/2025-01-28_ETHUSDT_ob500.data"

# Initialize a list to store valid JSON rows
data_list = []

# Step 1: Read the file line by line and parse JSON
with open(file_path, 'r') as file:
    for line in file:
        try:
            # Parse each line as JSON
            data_list.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Skipping malformed line: {line.strip()} | Error: {e}")

# Step 2: Convert the valid JSON data to a DataFrame
df = pd.DataFrame(data_list)

# Step 3: Extract and normalize the 'data' field
data_df = pd.json_normalize(df['data'])

# Step 4: Combine relevant fields with the main DataFrame
df = pd.concat([df[['ts', 'type', 'topic']], data_df], axis=1)

# Step 5: Check for missing or invalid entries in 'b' and 'a'
df['b'] = df['b'].apply(lambda x: x if isinstance(x, list) else [])
df['a'] = df['a'].apply(lambda x: x if isinstance(x, list) else [])

# Step 6: Expand 'b' (bids) and 'a' (asks) into individual DataFrames
bids_df = pd.DataFrame(df['b'].explode().dropna().tolist(), columns=['bid_price', 'bid_size'])
asks_df = pd.DataFrame(df['a'].explode().dropna().tolist(), columns=['ask_price', 'ask_size'])

# Step 7: Add the symbol and timestamp to bids/asks DataFrames for clarity
bids_df['symbol'] = df['s'].iloc[0] if 's' in df.columns else None
asks_df['symbol'] = df['s'].iloc[0] if 's' in df.columns else None
bids_df['timestamp'] = df['ts'].iloc[0]
asks_df['timestamp'] = df['ts'].iloc[0]

# Display the processed DataFrames
print("Bids DataFrame:")
print(bids_df.head())

print("\nAsks DataFrame:")
print(asks_df.head())

Skipping malformed line: {"topic":"orderbook.500.ETHUSDT","type":"delta","ts":1738043314603,"data":{"s":"ETHUSDT","b":[["3220.41","0"],["3220.40","0"],["3220.37","0"],["3220.36","0"],["3220.34","0"],["3220.30","0"],["3220.27","0"],["3220.25","0"],["3220.24","0"],["3220.22","0"],["3220.21","0"],["3220.20","0"],["3220.18","0"],["3220.15","30.08"],["3220.12","0.79"],["3220.11","9.75"],["3220.08","0.46"],["3220.07","0.31"],["3220.06","0.06"],["3219.98","5.40"],["3219.90","3.70"],["3219.88","0.41"],["3219.87","1.98"],["3219.86","0.20"],["3219.81","1.05"],["3219.78","0"],["3219.77","0.01"],["3219.76","13.17"],["3219.73","0.01"],["3219.72","0"],["3219.67","22.54"],["3219.66","1.63"],["3219.62","0"],["3219.61","1.13"],["3219.60","13.87"],["3219.54","0.03"],["3219.53","12.49"],["3219.51","3.89"],["3219.50","0.02"],["3219.44","0.34"],["3219.41","0"],["3219.40","12.28"],["3219.39","2.63"],["3219.38","5.18"],["3219.36","37.97"],["3219.33","9.93"],["3219.30","2.24"],["3219.29","13.99"],["3219.26","

In [2]:
import pandas as pd
import ast

# Helper function to calculate the metrics
def calculate_orderbook_metrics(orderbook_data):
    # Convert the string to a list (if it's stored as a string representation of a list)
    orderbook_data = ast.literal_eval(orderbook_data) if isinstance(orderbook_data, str) else orderbook_data
    
    # Ensure the bid data contains numerical values (floats)
    orderbook_data = [(float(bid[0]), float(bid[1])) for bid in orderbook_data]

    total_bid_value = sum(bid[0] * bid[1] for bid in orderbook_data)  # bid_price * bid_size
    total_bid_size = sum(bid[1] for bid in orderbook_data)  # bid_size
    middle_bid_price = total_bid_value / total_bid_size if total_bid_size > 0 else None
    
    # Find the largest bid price and its corresponding size
    largest_bid = max(orderbook_data, key=lambda x: x[0]) if orderbook_data else None
    largest_bid_price, largest_bid_size = largest_bid if largest_bid else (None, None)
    
    # Find the largest bid size and its corresponding price
    largest_bid_size_data = max(orderbook_data, key=lambda x: x[1]) if orderbook_data else None
    largest_bid_size_price, largest_bid_size_value = largest_bid_size_data if largest_bid_size_data else (None, None)
    
    return middle_bid_price, largest_bid_price, largest_bid_size, largest_bid_size_price, largest_bid_size_value

def calculate_top_5_bid_ask_size(orderbook_data):
    # Convert the string to a list (if it's stored as a string representation of a list)
    orderbook_data = ast.literal_eval(orderbook_data) if isinstance(orderbook_data, str) else orderbook_data
    
    # Ensure the bid data contains numerical values (floats)
    orderbook_data = [(float(bid[0]), float(bid[1])) for bid in orderbook_data]
    
    # Sort the bids and asks by size in descending order
    orderbook_data_sorted = sorted(orderbook_data, key=lambda x: x[1], reverse=True)
    
    # Calculate the sum of the sizes of the top 5 bids and asks
    top_5_size_sum = sum(bid[1] for bid in orderbook_data_sorted[:5])
    
    return top_5_size_sum

def process_orderbook_data(df):
    # Initialize lists to store the results
    middle_bid_prices = []
    largest_bid_prices = []
    largest_bid_sizes = []
    largest_bid_size_prices = []
    largest_bid_size_values = []
    top_5_bid_sizes = []
    top_5_ask_sizes = []
    
    middle_ask_prices = []
    largest_ask_prices = []
    largest_ask_sizes = []
    largest_ask_size_prices = []
    largest_ask_size_values = []
    
    # Iterate over each row in the DataFrame and apply the helper functions
    for index, row in df.iterrows():
        # Process bids ('b')
        middle_bid_price, largest_bid_price, largest_bid_size, largest_bid_size_price, largest_bid_size_value = calculate_orderbook_metrics(row['b'])
        # Process asks ('a')
        middle_ask_price, largest_ask_price, largest_ask_size, largest_ask_size_price, largest_ask_size_value = calculate_orderbook_metrics(row['a'])
        
        # Calculate top 5 bid and ask size sums
        top_5_bid_size = calculate_top_5_bid_ask_size(row['b'])
        top_5_ask_size = calculate_top_5_bid_ask_size(row['a'])
        
        # Append results for bids and asks
        middle_bid_prices.append(middle_bid_price)
        largest_bid_prices.append(largest_bid_price)
        largest_bid_sizes.append(largest_bid_size)
        largest_bid_size_prices.append(largest_bid_size_price)
        largest_bid_size_values.append(largest_bid_size_value)
        top_5_bid_sizes.append(top_5_bid_size)
        
        middle_ask_prices.append(middle_ask_price)
        largest_ask_prices.append(largest_ask_price)
        largest_ask_sizes.append(largest_ask_size)
        largest_ask_size_prices.append(largest_ask_size_price)
        largest_ask_size_values.append(largest_ask_size_value)
        top_5_ask_sizes.append(top_5_ask_size)
    
    # Create a new DataFrame with all the results
    new_df = pd.DataFrame({
        'ts': df['ts'],
        'middle_bid_price': middle_bid_prices,
        'largest_bid_price': largest_bid_prices,
        'largest_bid_size': largest_bid_sizes,
        'largest_bid_size_price': largest_bid_size_prices,
        'largest_bid_size_value': largest_bid_size_values,
        'top_5_bid_size_sum': top_5_bid_sizes,
        'middle_ask_price': middle_ask_prices,
        'largest_ask_price': largest_ask_prices,
        'largest_ask_size': largest_ask_sizes,
        'largest_ask_size_price': largest_ask_size_prices,
        'largest_ask_size_value': largest_ask_size_values,
        'top_5_ask_size_sum': top_5_ask_sizes
    })
    
    return new_df

# Process the DataFrame and calculate the metrics
df_processed = process_orderbook_data(df)

# Display the updated DataFrame with all the calculated metrics
df_processed

Unnamed: 0,ts,middle_bid_price,largest_bid_price,largest_bid_size,largest_bid_size_price,largest_bid_size_value,top_5_bid_size_sum,middle_ask_price,largest_ask_price,largest_ask_size,largest_ask_size_price,largest_ask_size_value,top_5_ask_size_sum
0,1738022401901,3177.489548,3181.17,58.49,3177.54,304.35,873.39,3184.274607,3187.12,0.03,3184.49,316.35,722.76
1,1738022401903,3179.276930,3181.17,58.50,3178.67,78.75,190.14,3183.181742,3187.12,0.00,3183.77,83.09,199.68
2,1738022402001,3177.998707,3180.62,14.81,3178.73,84.45,173.95,3182.933566,3187.10,0.00,3183.83,89.76,178.36
3,1738022402101,3179.643723,3181.17,58.49,3178.71,79.80,187.72,3183.387996,3186.83,1.27,3183.82,82.07,147.11
4,1738022402201,3178.793844,3181.17,58.50,3178.62,80.93,279.20,3183.300678,3187.10,1.96,3183.81,78.61,126.94
...,...,...,...,...,...,...,...,...,...,...,...,...,...
209121,1738043314101,3218.504132,3220.70,0.31,3217.99,79.92,184.42,3222.404937,3225.29,3.51,3220.71,82.45,264.01
209122,1738043314201,3218.233476,3220.70,0.51,3216.00,37.34,90.62,3222.409038,3226.18,0.00,3220.71,82.14,273.78
209123,1738043314302,3217.709445,3220.44,0.00,3216.57,115.05,325.35,3221.625508,3224.99,5.06,3220.71,91.80,269.73
209124,1738043314401,3218.338262,3220.22,1.19,3217.71,77.62,186.97,3221.781415,3223.66,0.13,3220.71,93.46,312.25


In [5]:
import pandas as pd

# Assuming df_processed is your original DataFrame and target_ts_list is the list of target timestamps
target_ts_list = list(range(1738022405000, 1738043315000, 5000))

# Initialize a list to store the rows with the nearest timestamp and corresponding data
nearest_rows = []

# Iterate over each target timestamp in the list
for target_ts in target_ts_list:
    # Calculate the absolute difference between the target timestamp and the timestamps in df_processed
    df_processed['ts_diff'] = abs(df_processed['ts'] - target_ts)
    
    # Find the index of the row with the smallest ts_diff (i.e., nearest timestamp)
    nearest_index = df_processed['ts_diff'].idxmin()
    
    # Append the corresponding row to the nearest_rows list
    nearest_rows.append(df_processed.loc[nearest_index].drop(columns=['ts_diff']))
    
    # Drop the ts_diff column to keep the DataFrame clean
    df_processed.drop(columns=['ts_diff'], inplace=True)

# Convert the list of nearest rows to a DataFrame
nearest_df = pd.DataFrame(nearest_rows)

# Add the target timestamp from the list as the 'ts' column
nearest_df['ts'] = target_ts_list

# Set the display option to show full precision for float numbers except for the 'ts' column
pd.set_option('display.float_format', '{:.2f}'.format)

# Display the updated DataFrame
nearest_df

Unnamed: 0,ts,middle_bid_price,largest_bid_price,largest_bid_size,largest_bid_size_price,largest_bid_size_value,top_5_bid_size_sum,middle_ask_price,largest_ask_price,largest_ask_size,largest_ask_size_price,largest_ask_size_value,top_5_ask_size_sum,ts_diff
32,1738022405000,3178.94,3181.17,76.01,3176.36,122.18,327.74,3184.13,3187.26,0.00,3185.93,120.35,267.16,1.00
82,1738022410000,3177.35,3180.38,0.90,3176.27,87.75,231.18,3181.21,3186.37,0.00,3180.39,81.88,178.54,1.00
132,1738022415000,3178.93,3180.70,4.38,3177.86,95.41,196.24,3181.92,3186.71,0.01,3182.96,88.56,184.76,1.00
182,1738022420000,3179.20,3180.84,1.80,3178.07,78.60,168.89,3182.66,3186.76,0.01,3180.85,97.59,350.43,2.00
232,1738022425000,3179.47,3181.19,55.79,3178.78,81.55,346.85,3184.41,3187.21,1.19,3186.97,277.00,523.81,5.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208880,1738043290000,3217.54,3219.09,33.60,3216.55,77.83,253.24,3220.61,3224.83,0.08,3219.10,89.26,225.36,1.00
208930,1738043295000,3217.57,3218.72,40.52,3216.56,77.73,270.98,3221.25,3224.64,0.01,3221.72,81.87,243.72,2.00
208980,1738043300000,3216.59,3218.94,0.00,3215.98,327.86,604.25,3220.66,3224.68,0.00,3221.47,79.20,227.28,1.00
209030,1738043305000,3217.36,3219.44,8.61,3217.37,78.29,123.82,3221.45,3225.56,0.00,3222.53,77.67,186.46,1.00


In [6]:
# Assuming nearest_df is your DataFrame
nearest_df.to_csv('cleaned_ob_20250128.csv', index=False)