In [2]:
import pandas as pd

# Column dtypes handed to read_csv: the symbol column is stored as a
# categorical, the four OHLC price columns as floats, and the volume as an integer.
dtype_mapping = dict(
    Symbol='category',
    Open='float64',
    High='float64',
    Low='float64',
    Close='float64',
    Volume='int64',
)

# Read the 1-minute OHLC bars; the first line of the file (header=0) supplies
# the column names.
data = pd.read_csv(
    './tick_data/barchart.com/concatenated_barchart_data3.csv',
    header=0,
    dtype=dtype_mapping,
)
# NOTE: the explicit numeric dtypes assume the file holds plain numbers, i.e.
# no thousands separators such as "4,818.0" -- check the CSV if loading fails.


In [3]:
# Summarise the loaded frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707945 entries, 0 to 707944
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   Time    707945 non-null  object  
 1   Symbol  707945 non-null  category
 2   Open    707945 non-null  float64 
 3   High    707945 non-null  float64 
 4   Low     707945 non-null  float64 
 5   Close   707945 non-null  float64 
 6   Volume  707945 non-null  int64   
dtypes: category(1), float64(4), int64(1), object(1)
memory usage: 33.1+ MB


In [4]:
# Preview the first rows of the loaded bars.
data.head()


Unnamed: 0,Time,Symbol,Open,High,Low,Close,Volume
0,01/01/2024 23:00:00,ESH24,4818.0,4819.5,4815.75,4818.75,1577
1,01/01/2024 23:01:00,ESH24,4818.75,4819.75,4818.0,4819.75,783
2,01/01/2024 23:02:00,ESH24,4819.75,4820.75,4819.25,4819.5,531
3,01/01/2024 23:03:00,ESH24,4819.5,4820.0,4819.25,4820.0,119
4,01/01/2024 23:04:00,ESH24,4819.75,4819.75,4818.5,4818.75,190


In [None]:
VOLUME_THRESHOLD = 50000  # cumulative Volume at which the current bar is closed
# Fixed column order so the resulting frame has these columns even when no bar closes.
VOLUME_BAR_COLUMNS = ['Symbol', 'Open', 'High', 'Low', 'Close', 'Volume', 'Time']


def build_volume_bars(frame, threshold):
    """Aggregate consecutive rows of ``frame`` into volume bars.

    Rows are consumed in their existing order. A bar starts at the first row
    seen after the previous bar closed and closes on the row where the
    accumulated ``Volume`` reaches ``threshold``. Rows left over after the
    last closed bar (accumulated volume below the threshold) are not emitted.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must provide the columns ``Time``, ``Symbol``, ``Open``, ``High``,
        ``Low``, ``Close`` and ``Volume``.
    threshold : int or float
        Accumulated volume at which a bar is closed.

    Returns
    -------
    list of dict
        One dict per closed bar with the keys ``Symbol``, ``Open``, ``High``,
        ``Low``, ``Close``, ``Volume`` and ``Time``. ``Symbol``, ``Open`` and
        ``Time`` come from the bar's first row, ``Close`` from its last row.
    """
    bars = []
    bar = None  # accumulator for the bar in progress; None means "start a new bar"

    # Walking the columns in lockstep yields the same per-row values as
    # DataFrame.iterrows() without building a Series for every row, which is
    # far cheaper on a frame with hundreds of thousands of rows.
    rows = zip(
        frame['Time'], frame['Symbol'], frame['Open'], frame['High'],
        frame['Low'], frame['Close'], frame['Volume'],
    )
    for ts, symbol, open_, high, low, close, volume in rows:
        if bar is None:
            # NOTE(review): a bar keeps the symbol of its first row even if
            # later rows carry a different Symbol -- confirm whether bars
            # should be cut at symbol boundaries.
            bar = {
                'Symbol': symbol,
                'Open': open_,
                'High': -float('inf'),
                'Low': float('inf'),
                'Close': None,
                'Volume': 0,
                'Time': ts,  # timestamp of the bar's first row
            }
        bar['High'] = max(bar['High'], high)
        bar['Low'] = min(bar['Low'], low)
        bar['Close'] = close
        bar['Volume'] += volume

        if bar['Volume'] >= threshold:
            bars.append(bar)
            bar = None
    return bars


volume_bars = build_volume_bars(data, VOLUME_THRESHOLD)
volume_bars_df = pd.DataFrame(volume_bars, columns=VOLUME_BAR_COLUMNS)

In [None]:
# Summarise the aggregated volume bars: column names, non-null counts and dtypes.
volume_bars_df.info()



In [1]:
# Parse the Time column with pd.to_datetime, then write the volume bars to CSV
# without the index column.
# NOTE(review): the NameError recorded for this cell suggests it was executed
# in a kernel where pandas had not been imported yet -- run the notebook top
# to bottom so the import and aggregation cells execute first.
volume_bars_df['Time'] = pd.to_datetime(volume_bars_df['Time'])
volume_bars_df.to_csv('tick_data/bars/volume_bars_barchart.csv', index=False)


NameError: name 'pd' is not defined

In [5]:
# Example threshold: accumulated Close * Volume at which the current bar is closed.
# NOTE(review): with prices near 4800 (see the data preview) a single row with
# Volume of roughly 210 or more already exceeds this value, so many bars will
# consist of one row -- confirm the threshold is what is intended.
DOLLAR_THRESHOLD = 1000000
# Fixed column order so the resulting frame has these columns even when no bar closes.
DOLLAR_BAR_COLUMNS = ['Symbol', 'Open', 'High', 'Low', 'Close', 'DollarVolume', 'Time']


def build_dollar_bars(frame, threshold):
    """Aggregate consecutive rows of ``frame`` into dollar bars.

    Rows are consumed in their existing order. Each row contributes
    ``Close * Volume`` to the running total; a bar starts at the first row
    seen after the previous bar closed and closes on the row where the
    running total reaches ``threshold``. Rows left over after the last closed
    bar (running total below the threshold) are not emitted.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must provide the columns ``Time``, ``Symbol``, ``Open``, ``High``,
        ``Low``, ``Close`` and ``Volume``.
    threshold : int or float
        Accumulated ``Close * Volume`` at which a bar is closed.

    Returns
    -------
    list of dict
        One dict per closed bar with the keys ``Symbol``, ``Open``, ``High``,
        ``Low``, ``Close``, ``DollarVolume`` and ``Time``. ``Symbol``, ``Open``
        and ``Time`` come from the bar's first row, ``Close`` from its last row.
    """
    bars = []
    bar = None  # accumulator for the bar in progress; None means "start a new bar"

    # Walking the columns in lockstep yields the same per-row values as
    # DataFrame.iterrows() without building a Series for every row, which is
    # far cheaper on a frame with hundreds of thousands of rows.
    rows = zip(
        frame['Time'], frame['Symbol'], frame['Open'], frame['High'],
        frame['Low'], frame['Close'], frame['Volume'],
    )
    for ts, symbol, open_, high, low, close, volume in rows:
        if bar is None:
            # NOTE(review): a bar keeps the symbol of its first row even if
            # later rows carry a different Symbol -- confirm whether bars
            # should be cut at symbol boundaries.
            bar = {
                'Symbol': symbol,
                'Open': open_,
                'High': -float('inf'),
                'Low': float('inf'),
                'Close': None,
                'DollarVolume': 0,
                'Time': ts,  # timestamp of the bar's first row
            }
        bar['High'] = max(bar['High'], high)
        bar['Low'] = min(bar['Low'], low)
        bar['Close'] = close
        bar['DollarVolume'] += close * volume

        if bar['DollarVolume'] >= threshold:
            bars.append(bar)
            bar = None
    return bars


dollar_bars = build_dollar_bars(data, DOLLAR_THRESHOLD)
# Passing the column list keeps the 'Time' column present even for an empty
# result, so the conversion below cannot fail with a KeyError.
dollar_bars_df = pd.DataFrame(dollar_bars, columns=DOLLAR_BAR_COLUMNS)
dollar_bars_df['Time'] = pd.to_datetime(dollar_bars_df['Time'])
dollar_bars_df.to_csv('tick_data/bars/dollar_bars_barchart.csv', index=False)