In [9]:
import pandas as pd

dtype_mapping = {
    'Symbol': 'category',  # Categorical for symbols (e.g., stock tickers)
    'Open': 'float64',  # OHLC as float
    'High': 'float64',
    'Low': 'float64',
    'Close': 'float64',
    'Volume': 'int64'  # Volume typically as integer
}
filename_end="2019_2024"

# Load the 1-minute OHLC data and set the first row as column names
data = pd.read_csv(f'./tick_data/barchart.com/concatenated_barchart_{filename_end}.csv', header=0, dtype=dtype_mapping
)
# make sure no commas for thousands etc
data['Date']=pd.to_datetime(data['Date'])

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2116299 entries, 0 to 2116298
Data columns (total 7 columns):
 #   Column  Dtype         
---  ------  -----         
 0   Date    datetime64[ns]
 1   Symbol  category      
 2   Open    float64       
 3   High    float64       
 4   Low     float64       
 5   Close   float64       
 6   Volume  int64         
dtypes: category(1), datetime64[ns](1), float64(4), int64(1)
memory usage: 98.9 MB


In [11]:
data.head()


Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume
0,2018-12-31 06:00:00,ESH19,2503.75,2504.25,2503.75,2504.25,71
1,2018-12-31 06:01:00,ESH19,2504.25,2504.25,2504.25,2504.25,18
2,2018-12-31 06:02:00,ESH19,2504.25,2504.25,2504.0,2504.25,6
3,2018-12-31 06:03:00,ESH19,2504.25,2504.5,2504.25,2504.25,81
4,2018-12-31 06:04:00,ESH19,2504.0,2504.25,2504.0,2504.0,47


In [12]:
volume_bars = []
VOLUME_THRESHOLD = 10000
# Initialize variables for aggregation
current_open = None
current_high = -float('inf')
current_low = float('inf')
current_close = None
current_volume = 0
current_symbol = None
current_time = None

# Iterate through rows to build volume bars
for index, row in data.iterrows():
    if current_open is None:
        current_open = row['Open']
        current_symbol = row['Symbol']
        current_time = row['Date']  # Capture the current time
    current_high = max(current_high, row['High'])
    current_low = min(current_low, row['Low'])
    current_close = row['Close']
    current_volume += row['Volume']

    if current_volume >= VOLUME_THRESHOLD:
        volume_bars.append({
            'Symbol': current_symbol,
            'Open': current_open,
            'High': current_high,
            'Low': current_low,
            'Close': current_close,
            'Volume': current_volume,
            'Date': current_time  # Add time to the volume bar
        })
        current_open = None
        current_high = -float('inf')
        current_low = float('inf')
        current_close = None
        current_volume = 0
        current_symbol = None
        current_time = None

volume_bars_df = pd.DataFrame(volume_bars)

In [13]:
volume_bars_df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191078 entries, 0 to 191077
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   Symbol  191078 non-null  object        
 1   Open    191078 non-null  float64       
 2   High    191078 non-null  float64       
 3   Low     191078 non-null  float64       
 4   Close   191078 non-null  float64       
 5   Volume  191078 non-null  int64         
 6   Date    191078 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 10.2+ MB


In [14]:
volume_bars_df['Date'] = pd.to_datetime(volume_bars_df['Date'])
volume_bars_df.to_csv(f'tick_data/bars/volume_bars_barchart_{filename_end}.csv', index=False)


In [15]:
dollar_bars = []
DOLLAR_THRESHOLD = 1000000  # Example threshold for dollar bars
# Initialize variables for aggregation
current_open = None
current_high = -float('inf')
current_low = float('inf')
current_close = None
current_dollar_volume = 0
current_symbol = None
current_time = None

# Iterate through rows to build dollar bars
for index, row in data.iterrows():
    if current_open is None:
        current_open = row['Open']
        current_symbol = row['Symbol']
        current_time = row['Date']  # Capture the current time
    current_high = max(current_high, row['High'])
    current_low = min(current_low, row['Low'])
    current_close = row['Close']
    current_dollar_volume += row['Close'] * row['Volume']

    if current_dollar_volume >= DOLLAR_THRESHOLD:
        dollar_bars.append({
            'Symbol': current_symbol,
            'Open': current_open,
            'High': current_high,
            'Low': current_low,
            'Close': current_close,
            'DollarVolume': current_dollar_volume,
            'Date': current_time  # Add time to the dollar bar
        })
        current_open = None
        current_high = -float('inf')
        current_low = float('inf')
        current_close = None
        current_dollar_volume = 0
        current_symbol = None
        current_time = None

dollar_bars_df = pd.DataFrame(dollar_bars)
dollar_bars_df['Date'] = pd.to_datetime(dollar_bars_df['Date'])
dollar_bars_df.to_csv(f'tick_data/bars/dollar_bars_barchart_{filename_end}.csv', index=False)

In [16]:
dollar_bars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1390197 entries, 0 to 1390196
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype         
---  ------        --------------    -----         
 0   Symbol        1390197 non-null  object        
 1   Open          1390197 non-null  float64       
 2   High          1390197 non-null  float64       
 3   Low           1390197 non-null  float64       
 4   Close         1390197 non-null  float64       
 5   DollarVolume  1390197 non-null  float64       
 6   Date          1390197 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 74.2+ MB
