# Data update

If we want to update the data, we should append to the existing data sets instead of downloading everything again. However, if there were splits/dividends in the meantime, we have to readjust all historical data. From Alpaca and most other vendors we can download both adjusted and unadjusted daily data, so we use those to calculate the adjustment factor. This is probably a crude way, but this way I am sure that the adjustments are correct. I could also use the dividend/split endpoint, but that would overcomplicate things.

For example, let's assume that we have (already adjusted) data from day 1 to day 10. After the close on day 15 we want to update the historical data. Let's assume that somewhere from day 11 to day 15 (inclusive) there was a 1-to-2 stock split. Then the day 10 adjustment factor will be 0.5, so all data from day 1 to day 10 should be multiplied by 0.5. The data from day 11 to day 15 is simply appended, given that it is already adjusted.

We will mostly just follow the same steps as in <code>bars.ipynb</code> and <code>tick.ipynb</code>, but instead of downloading everything we simply append. We also have to do some checks to see if the dates make sense.

In [47]:
from alpaca.data import StockHistoricalDataClient
from alpaca.data.requests import StockBarsRequest
from alpaca.data.timeframe import TimeFrame, TimeFrameUnit
from alpaca.data.enums import Adjustment

from datetime import datetime, time, timedelta
from pytz import timezone
import pandas as pd
import numpy as np

In [48]:
# Date up to which the data is updated (naive datetime, interpreted as ET).
UPDATE_TO = datetime(year=2023, month=7, day=28)
# NOTE(review): not referenced in the cells visible here - presumably the symbols to update.
SYMBOL_LIST = ["SPY"]
# If True, the processed m1 data only contains market hours.
MARKET_HOURS_ONLY = True

**Step 1: Download m1 data and append.**

In [49]:
# Build the list of trading dates from SPY daily bars. All available SPY daily data
# (unadjusted and adjusted) is downloaded again in case we do not have it yet.
# The secret file holds the public key on its first line and the private key on its second.
with open("../../data/alpaca/secret.txt") as f:
    PUBLIC_KEY = next(f).strip()
    PRIVATE_KEY = next(f).strip()

stock_client = StockHistoricalDataClient(PUBLIC_KEY, PRIVATE_KEY)

# One request per adjustment mode. The folder name doubles as the log label, so the
# printed message always matches the file that was actually written (the two messages
# used to be swapped).
for adjustment, folder, columns in (
    (Adjustment.RAW, "unadjusted", ["close", "volume"]),
    (Adjustment.ALL, "adjusted", ["close"]),
):
    spy_request = StockBarsRequest(
        symbol_or_symbols="SPY",
        start=timezone("US/Eastern").localize(datetime(2015, 12, 1)),
        end=timezone("US/Eastern").localize(UPDATE_TO),
        timeframe=TimeFrame(1, TimeFrameUnit.Day),
        adjustment=adjustment,
    )
    bars = stock_client.get_stock_bars(spy_request).df

    spy_df = bars.loc["SPY"][columns]
    spy_df.index.names = ["datetime"]
    spy_df.to_csv(f"../../data/alpaca/raw/d1/{folder}/SPY.csv")
    print(f"Downloaded SPY {folder}")

# Retrieve the downloaded (adjusted) data; its index defines the trading dates.
SPY_df = pd.read_csv(
    "../../data/alpaca/raw/d1/adjusted/SPY.csv",
    index_col="datetime",
    parse_dates=True,
)
# Drop the timezone information so the index is naive.
SPY_df.set_index(SPY_df.index.tz_localize(None), inplace=True)
all_trading_dates = pd.to_datetime(SPY_df.index).date

# Also build every minute from 04:00 up to and including 19:59 for each trading date
# (so the last minute of each day is 19:59).
amount_of_days = len(all_trading_dates)
all_minutes = np.array(
    [
        datetime.combine(date, time(hour=hour, minute=minute))
        for date in all_trading_dates
        for hour in range(4, 20)
        for minute in range(0, 60)
    ]
)

# 16 hours of 60 minutes per trading date.
assert len(all_minutes) == amount_of_days * 16 * 60

Downloaded SPY adjusted
Downloaded SPY unadjusted


In [50]:
stock = "AMC"

# 1. UPDATE RAW BARS
# We want to append only the new bars, so we need the start date of the update:
# the first trading date after the last date present in the stored raw file.
bars_old = pd.read_csv(
    f"../../data/alpaca/raw/m1/{stock}.csv",
    index_col="datetime",
    parse_dates=True,
)
last_available_date = bars_old.index[-1].date()
trading_dates_to_update = all_trading_dates[
    (all_trading_dates >= last_available_date + timedelta(days=1))
    & (all_trading_dates <= UPDATE_TO.date())
]

# Without this guard an already up-to-date file fails below with an opaque IndexError.
if len(trading_dates_to_update) == 0:
    raise Exception(
        f"No trading dates to update for {stock}: data is already available up to {last_available_date}."
    )

# Get new bars, from 04:00 ET on the first date to 20:00 ET on the last date.
stock_request = StockBarsRequest(
    symbol_or_symbols=stock,
    start=timezone("US/Eastern").localize(datetime.combine(trading_dates_to_update[0], time(hour=4))),
    end=timezone("US/Eastern").localize(datetime.combine(trading_dates_to_update[-1], time(hour=20))),
    timeframe=TimeFrame(1, TimeFrameUnit.Minute),
    adjustment=Adjustment.ALL,
)
print(f"{datetime.utcnow().replace(microsecond=0)} | Download {stock} m1 raw data from {trading_dates_to_update[0].strftime('%Y-%m-%d')} to {trading_dates_to_update[-1].strftime('%Y-%m-%d')}")

bars_new = stock_client.get_stock_bars(stock_request).df

bars_new = bars_new.loc[stock][["open", "high", "low", "close", "volume"]]
bars_new.index.names = ["datetime"]

# Combine old and new bars
bars_all = pd.concat([bars_old, bars_new])

if bars_all.index.duplicated().any():
    raise Exception('When merging old and new bars, there were duplicate dates.')

# NOTE(review): the write of the raw file is disabled, so the raw CSV on disk is not
# changed by this cell even though the message below reports an update. Confirm
# whether this is intentional before relying on the raw file.
#bars_all.to_csv(f"../../data/alpaca/raw/m1/{stock}.csv")
print(f"{datetime.utcnow().replace(microsecond=0)} | Updated {stock} m1 raw data")

# 2. UPDATE PROCESSED BARS
# Processed bars always cover the same dates as the raw bars because in bars.ipynb we
# processed all raw data. So we do not have to worry about dates not being equal.
bars_processed_new = bars_new.copy()
bars_processed_new["tradeable"] = True

# Convert the index to US/Eastern and then drop the timezone information.
bars_processed_new.set_index(bars_processed_new.index.tz_convert("US/Eastern"), inplace=True)
bars_processed_new.set_index(bars_processed_new.index.tz_localize(None), inplace=True)

start_datetime = bars_processed_new.index[0]
end_datetime = bars_processed_new.index[-1]

# Every minute from 04:00 on the first new day up to 19:59 on the last new day.
stock_minutes = all_minutes[
    (all_minutes >= start_datetime.replace(hour=4, minute=0, second=0))
    & (all_minutes <= end_datetime.replace(hour=19, minute=59, second=0))
]

# Minutes without a bar become rows of missing values, which are filled below.
bars_processed_new = bars_processed_new.reindex(stock_minutes)

# Get old processed bars
bars_processed_old = pd.read_csv(
    f"../../data/alpaca/processed/m1/bars/{stock}.csv",
    index_col="datetime",
    parse_dates=True,
)

# Combine old + new
bars_processed_all = pd.concat([bars_processed_old, bars_processed_new])

# Fill empty values with last available price, which is the last close.
# Columns are reassigned instead of filled in place on a column selection, because
# that chained in-place form does not update the frame under pandas Copy-on-Write.
bars_processed_all["tradeable"] = bars_processed_all["tradeable"].fillna(False)
bars_processed_all["volume"] = bars_processed_all["volume"].fillna(0)
bars_processed_all["close"] = bars_processed_all["close"].ffill()

for column in ("open", "low", "high"):
    bars_processed_all[column] = bars_processed_all[column].fillna(bars_processed_all["close"])

# In contrast to the initial download, we do not have to backfill. That is also why we
# forward filled after combining. If we did it before, there could still be empty values
# for the first few values of the updated bars. Backfilling them would be wrong because
# we have old data to forward fill with.

if MARKET_HOURS_ONLY:
    bars_processed_all = bars_processed_all.between_time("9:30", "15:59")

if bars_processed_all.index.duplicated().any():
    raise Exception('When merging old and new bars, there were duplicate dates.')

bars_processed_all.to_csv(f"../../data/alpaca/processed/m1/bars/{stock}.csv")




2023-07-31 13:24:49 | Download AMC m1 raw data from 2023-07-24 to 2023-07-28
2023-07-31 13:24:50 | Updated AMC m1 raw data
