# Alpaca 1-min & 1-day data import & processing
*Note: I do not intend to use Alpaca data, as it only has IEX data. Also it has no delisted stocks etc. This is an exercise for myself such that I can easily manipulate data later on when I have a better data subscription.*

This notebook downloads 1-minute and 1-day OHLC data from the Alpaca API and converts them to my preferred format.

My public and private keys are in ../../data/alpaca/secret.txt (public key on the first line, private key on the second).

First, this notebook downloads the 1-minute data to ../../data/alpaca/raw/m1, which will contain the raw CSV files (in UTC).

Second, it processes the raw data and writes it to ../../data/alpaca/processed/m1/bars, which will contain the columns <code>["open", "high", "low", "close", "volume", "tradeable"]</code> in naive ET time with no missing minutes. All data is adjusted for everything (splits/dividends). The column "tradeable" is False if the bar was forward filled, which means that there were zero trades in that minute. I will just assume that there was a halt or that there was no liquidity; in both cases we cannot trade. In live trading we should check whether the stock is actually tradeable and liquid.

Third, it downloads 1-day adjusted data to ../../data/alpaca/raw/d1/adjusted and 1-day unadjusted data to ../../data/alpaca/raw/d1/unadjusted.

Fourth, it calculates the adjustment factor and then adds two columns to the files in ../../data/alpaca/processed/m1/bars, such that all columns are then <code>["open", "high", "low", "close", "close_original", "volume", "tradeable", "adjustment"]</code>.

*Note: a data point at 15:59 with OHLC means that the open was at 15:59:00 and close at 16:00:00. So the data does not contain a point for 16:00 if we only want market opening hours.*

In [15]:
from alpaca.data import StockHistoricalDataClient
from alpaca.data.requests import StockBarsRequest
from alpaca.data.timeframe import TimeFrame, TimeFrameUnit
from alpaca.data.enums import Adjustment

from datetime import datetime, time
from pytz import timezone
import pandas as pd
import numpy as np

**Step 1: Download m1 data and save.**

In [16]:
# Symbols to download. A larger symbol set, kept for reference:
# SYMBOL_LIST = ["APE", "AMC", "TOP", "AAPL", "O", "SPY", "DEM"]
SYMBOL_LIST = ["O"]

# Both bounds are naive datetimes expressed in ET.
# datetime(2015, 12, 1) is the first available date from Alpaca.
START_DATE = datetime(year=2022, month=1, day=1, hour=4)
# For free subscriptions the last available moment is the current time minus 15 minutes.
END_DATE = datetime(year=2023, month=7, day=21, hour=20)

In [17]:
# Read the API credentials: the first line holds the public key, the second
# line the private key.
with open("../../data/alpaca/secret.txt") as f:
    PUBLIC_KEY = next(f).strip()
    PRIVATE_KEY = next(f).strip()

stock_client = StockHistoricalDataClient(PUBLIC_KEY, PRIVATE_KEY)

# START_DATE / END_DATE are naive ET datetimes. They are the same for every
# symbol, so make them timezone-aware once instead of on every iteration.
eastern = timezone("US/Eastern")
m1_start = eastern.localize(START_DATE)
m1_end = eastern.localize(END_DATE)

for stock in SYMBOL_LIST:
    # Request fully adjusted 1-minute bars for this symbol.
    stock_request = StockBarsRequest(
        symbol_or_symbols=stock,
        start=m1_start,
        end=m1_end,
        timeframe=TimeFrame(1, TimeFrameUnit.Minute),
        adjustment=Adjustment.ALL,
    )
    bars = stock_client.get_stock_bars(stock_request).df

    # Select this symbol's rows, keep only the OHLCV columns and store the
    # result with the index named "datetime".
    stock_df = bars.loc[stock][["open", "high", "low", "close", "volume"]]
    stock_df.index.names = ["datetime"]
    stock_df.to_csv(f"../../data/alpaca/raw/m1/{stock}.csv")

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware "now"
    # in UTC and strip the tzinfo again so the printed text keeps its format.
    utc_now = datetime.now(timezone("UTC")).replace(microsecond=0, tzinfo=None)
    print(f"{utc_now} | Downloaded {stock} m1 data")

In [None]:
# Preview the first five rows of the raw SPY 1-minute file.
raw_spy_path = "../../data/alpaca/raw/m1/SPY.csv"
example = pd.read_csv(raw_spy_path, index_col="datetime", parse_dates=True, nrows=5)
example.head()

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-03 05:01:00+00:00,465.67,465.67,465.67,465.67,4500.0
2022-01-03 09:00:00+00:00,465.22,465.74,465.22,465.5,7790.0
2022-01-03 09:01:00+00:00,465.5,465.5,465.5,465.5,116.0
2022-01-03 09:02:00+00:00,465.46,465.46,465.43,465.43,410.0
2022-01-03 09:03:00+00:00,465.41,465.41,465.31,465.39,708.0


**Step 2: Load raw data and process.**


In [None]:
# Optionally override the symbols to process:
# SYMBOL_LIST = ["APE", "AMC", "TOP", "AAPL", "O", "SPY", "VYM"]

# If True, only the 1-minute bars from 9:30 to 16:00 (market open) are kept.
MARKET_HOURS_ONLY = True

* Problem: Some stocks do not have trades for all minutes the market is open. Especially since we source from IEX.
* Solution: Get the market opening days from SPY. Then create a list of all timestamps the market is open (=SPY is open). Then reindex minute data using this list.

In [None]:
# The SPY minute data defines which days the market was open.
# (The daily data could be used instead once it has been downloaded.)
SPY_df = pd.read_csv(
    "../../data/alpaca/raw/m1/SPY.csv",
    index_col="datetime",
    parse_dates=True,
)
print("Loaded SPY")

# Extended US market hours run from 4:00 to 20:00 ET. In UTC that window
# shifts with DST and spills over into the next calendar day, so convert the
# index to ET first and then drop the timezone (UTC -> ET -> naive).
SPY_df.index = SPY_df.index.tz_convert("US/Eastern").tz_localize(None)

dates = np.unique(SPY_df.index.date)
amount_of_days = len(dates)

# One timestamp for every minute from 04:00 up to and including 19:59 on
# each day SPY has data.
all_minutes = np.array(
    [
        datetime.combine(day, time(hour=hour, minute=minute))
        for day in dates
        for hour in range(4, 20)
        for minute in range(60)
    ]
)

# 16 hours of 60 minutes per day.
assert len(all_minutes) == amount_of_days * 16 * 60

print(all_minutes)

Loaded SPY
[datetime.datetime(2022, 1, 3, 4, 0) datetime.datetime(2022, 1, 3, 4, 1)
 datetime.datetime(2022, 1, 3, 4, 2) ...
 datetime.datetime(2023, 7, 21, 19, 57)
 datetime.datetime(2023, 7, 21, 19, 58)
 datetime.datetime(2023, 7, 21, 19, 59)]


In [None]:
# Turn every raw 1-minute file into a gap-free, naive-ET bar file with a
# "tradeable" flag that is False for minutes without a real bar.
for stock in SYMBOL_LIST:
    stock_df = pd.read_csv(
        f"../../data/alpaca/raw/m1/{stock}.csv",
        index_col="datetime",
        parse_dates=True,
    )
    if stock_df.empty:
        # Without a single bar there is no first/last timestamp to build the
        # minute grid from.
        print(f"{stock} | WARNING: no raw m1 data found, skipped")
        continue

    # UTC -> ET -> naive
    stock_df.index = stock_df.index.tz_convert("US/Eastern").tz_localize(None)

    # For some stocks we do not have data for the entirety of the SPY days.
    # Hence we need to shrink all_minutes to the dates of the corresponding stock.
    start_datetime = stock_df.index[0]
    end_datetime = stock_df.index[-1]

    stock_minutes = all_minutes[
        (all_minutes >= start_datetime.replace(hour=4, minute=0, second=0))
        & (all_minutes <= end_datetime.replace(hour=19, minute=59, second=0))
    ]

    # Remember which minutes had a real bar before the gaps are introduced.
    traded_minutes = stock_df.index
    stock_df = stock_df.reindex(stock_minutes)

    # A minute is tradeable only if the raw data contained a bar for it.
    # Computing the flag after the reindex yields a proper boolean column.
    stock_df["tradeable"] = stock_df.index.isin(traded_minutes)

    # Assign the filled columns back instead of using a chained
    # fillna(..., inplace=True), which does not modify the frame under
    # pandas Copy-on-Write.
    stock_df["volume"] = stock_df["volume"].fillna(0)

    # Fill empty values with the last available price, which is the last close.
    stock_df["close"] = stock_df["close"].ffill()
    for column in ("open", "low", "high"):
        stock_df[column] = stock_df[column].fillna(stock_df["close"])

    # Only affects the very start, where no earlier close exists.
    # Else backfill shouldn't be used because of look-ahead bias.
    stock_df["open"] = stock_df["open"].bfill()
    for column in ("close", "low", "high"):
        stock_df[column] = stock_df[column].fillna(stock_df["open"])

    if MARKET_HOURS_ONLY:
        # The 15:59 bar closes at 16:00, so it is the last regular-hours bar.
        stock_df = stock_df.between_time("9:30", "15:59")

    stock_df.to_csv(f"../../data/alpaca/processed/m1/bars/{stock}.csv")
    print(f"Processed {stock}")

Processed AMC


In [None]:
# Preview the first five rows of the processed APE 1-minute file.
processed_ape_path = "../../data/alpaca/processed/m1/bars/APE.csv"
example = pd.read_csv(
    processed_ape_path, index_col="datetime", parse_dates=True, nrows=5
)
example.head(5)

Unnamed: 0_level_0,open,high,low,close,close_original,volume,tradeable,adjustment
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-08-22 09:30:00,6.95,6.95,6.95,6.95,6.95,0.0,False,1.0
2022-08-22 09:31:00,6.95,6.98,6.93,6.9498,6.9498,341847.0,True,1.0
2022-08-22 09:32:00,6.95,7.3,6.93,7.3,7.3,111179.0,True,1.0
2022-08-22 09:33:00,7.3,7.3,7.3,7.3,7.3,0.0,False,1.0
2022-08-22 09:34:00,7.3,7.3,7.3,7.3,7.3,0.0,False,1.0


**Step 3: Download adjusted and unadjusted daily data.**

In [None]:
# SYMBOL_LIST = ["O"]
# START_DATE = datetime(2022, 1, 1)  # in ET; datetime(2015, 12, 1) is the first available date from Alpaca
# END_DATE = datetime(2023, 7, 21) # in ET; the last available for free subscriptions is current time minus 15 minutes

In [None]:
# Read the API credentials: the first line holds the public key, the second
# line the private key.
with open("../../data/alpaca/secret.txt") as f:
    PUBLIC_KEY = next(f).strip()
    PRIVATE_KEY = next(f).strip()

stock_client = StockHistoricalDataClient(PUBLIC_KEY, PRIVATE_KEY)

# Daily bars are requested from midnight ET on the start and end dates.
eastern = timezone("US/Eastern")
d1_start = eastern.localize(START_DATE.replace(hour=0))
d1_end = eastern.localize(END_DATE.replace(hour=0))

# Each adjustment mode is stored in the sub-folder of raw/d1 with the same
# label. Driving both downloads from one table keeps the status message in
# sync with the data that was actually written (the two messages used to be
# swapped).
daily_variants = [
    (Adjustment.RAW, "unadjusted"),
    (Adjustment.ALL, "adjusted"),
]

for stock in SYMBOL_LIST:
    for adjustment_mode, label in daily_variants:
        stock_request = StockBarsRequest(
            symbol_or_symbols=stock,
            start=d1_start,
            end=d1_end,
            timeframe=TimeFrame(1, TimeFrameUnit.Day),
            adjustment=adjustment_mode,
        )
        bars = stock_client.get_stock_bars(stock_request).df

        # Only close and volume are stored for the daily data.
        stock_df = bars.loc[stock][["close", "volume"]]
        stock_df.index.names = ["datetime"]
        stock_df.to_csv(f"../../data/alpaca/raw/d1/{label}/{stock}.csv")
        print(f"Downloaded {stock} {label}")

Downloaded O adjusted
Downloaded O unadjusted


**Step 4: Add columns "close_original" and "adjustment" to m1 data.**

In [None]:
# SYMBOL_LIST = ["APE", "AMC", "TOP", "AAPL", "O", "SPY", "VYM"]

In [None]:
# Add the columns "close_original" and "adjustment" to every processed
# 1-minute file, using the ratio of the adjusted to the unadjusted daily close.
for stock in SYMBOL_LIST:
    stock_df_unadjusted = pd.read_csv(
        f"../../data/alpaca/raw/d1/unadjusted/{stock}.csv",
        index_col="datetime",
        parse_dates=True,
    )
    stock_df_adjusted = pd.read_csv(
        f"../../data/alpaca/raw/d1/adjusted/{stock}.csv",
        index_col="datetime",
        parse_dates=True,
    )
    if not stock_df_adjusted.index.equals(stock_df_unadjusted.index):
        raise ValueError(
            "The indices in the adjusted and unadjusted DataFrames are not equal."
        )

    # Get adjustment factors per day.
    # The unadjusted_close * adjustment = adjusted_close
    adjustment = (stock_df_adjusted["close"] / stock_df_unadjusted["close"]).to_frame(
        "adjustment"
    )
    # Key the factors by calendar date so they can be matched to minute bars.
    adjustment.index = adjustment.index.date

    stock_df_processed = pd.read_csv(
        f"../../data/alpaca/processed/m1/bars/{stock}.csv",
        index_col="datetime",
        parse_dates=True,
    )
    # If this step already ran on the file, drop its previous output so the
    # merge below does not produce "adjustment_x"/"adjustment_y" columns.
    stock_df_processed = stock_df_processed.drop(
        columns=["close_original", "adjustment"], errors="ignore"
    )

    # Compare the actual sets of days rather than only their counts: equal
    # counts can still hide days that are missing on one side.
    days_in_processed = np.unique(stock_df_processed.index.date)
    only_in_minutes = np.setdiff1d(days_in_processed, adjustment.index)
    only_in_days = np.setdiff1d(adjustment.index, days_in_processed)

    if len(only_in_minutes) > 0 or len(only_in_days) > 0:
        print(
            f"{stock} | WARNING: The difference between the adjustment days and the processed days are:"
        )
        print(f"In minute but not in days: {only_in_minutes}")
        print(f"In days but not in minutes: {only_in_days}")

    stock_df_processed["temp_date"] = stock_df_processed.index.date  # Create date column in m1 data
    stock_df_with_adj = pd.merge(
        left=stock_df_processed,
        right=adjustment[["adjustment"]],
        how="left",
        left_on="temp_date",
        right_index=True,
    )
    # Only the adjustment column is inspected: the warning and the fill below
    # concern days without a daily adjustment factor.
    if stock_df_with_adj["adjustment"].isnull().any():
        print(
            f"{stock} | WARNING: dataframe contain null values for adjustments. Will do a forward fill and then a backward fill."
        )
        stock_df_with_adj["adjustment"] = (
            stock_df_with_adj["adjustment"].ffill().bfill()
        )

    # Undo the adjustment to recover the price as it originally traded.
    stock_df_with_adj["close_original"] = (
        stock_df_with_adj["close"] / stock_df_with_adj["adjustment"]
    )

    # Reorder columns (this also drops the helper column "temp_date").
    stock_df_with_adj = stock_df_with_adj[
        [
            "open",
            "high",
            "low",
            "close",
            "close_original",
            "volume",
            "tradeable",
            "adjustment",
        ]
    ]
    stock_df_with_adj.to_csv(f"../../data/alpaca/processed/m1/bars/{stock}.csv")
    print(f"{stock} adjustment added")

AMC adjustment added


In [None]:
# Preview the first five rows of the processed O 1-minute file.
processed_o_path = "../../data/alpaca/processed/m1/bars/O.csv"
example = pd.read_csv(
    processed_o_path, index_col="datetime", parse_dates=True, nrows=5
)
example.head(5)

Unnamed: 0_level_0,open,high,low,close,close_original,volume,tradeable,adjustment
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-01-03 09:30:00,66.98,67.03,66.92,66.94,71.714234,66907.0,True,0.933427
2022-01-03 09:31:00,66.93,66.93,66.83,66.85,71.617815,8665.0,True,0.933427
2022-01-03 09:32:00,66.84,66.9,66.81,66.84,71.607102,7422.0,True,0.933427
2022-01-03 09:33:00,66.83,66.87,66.79,66.83,71.596389,5417.0,True,0.933427
2022-01-03 09:34:00,66.79,66.79,66.7,66.7,71.457117,9107.0,True,0.933427
