In [53]:
import pandas as pd
import sys
import os
import numpy as np
import yfinance as yf
import torch.nn as nn
from sklearn.model_selection import train_test_split

In [65]:
# Context instruments: descriptive name -> ticker symbol.
# This mapping is passed to download_and_enrich_data() to build the joined
# context frame, whose columns are prefixed with the ticker symbol.
context_tickers = {
    # Bond funds, keyed by maturity bucket
    '30_year_bonds': 'TLT',
    '10_year_bonds': 'IEF',
    '5_year_bonds': 'IEI',
    '2_year_bonds': 'SHY',
    '1_year_bonds': 'SHV',
    # Commodities
    'gold': 'GLD',
    'silver': 'SLV',
    # 'copper': 'CPER', # starts in 2011 (presumably disabled because the download starts in 2010 and rows with gaps are dropped)
    'oil': 'USO',
    'natural_gas': 'UNG',
    # Equity indices
    'sp500': 'SPY',
    'nasdaq': 'QQQ',
    'dow_jones': 'DIA',
    'russell_2000': 'IWM',
    # Currencies and emerging markets
    'us_dollar': 'UUP',
    'emerging_markets': 'EEM',
    'euro': 'FXE',
    'british_pound': 'FXB',
    'japanese_yen': 'FXY',
    # 'bitcoin': 'BTC-USD', # starts in 2014
    # 'ethereum': 'ETH-USD', # starts in 2014
}

# Target instruments: descriptive name -> ticker symbol.
# NOTE(review): not referenced by any cell visible here; presumably these are
# the stocks to be modelled against the context data — confirm against the
# later cells.
target_tickers = {
    'coinbase': 'COIN',
    'robinhood': 'HOOD',
    'amazon': 'AMZN',
    'apple': 'AAPL',
    'google': 'GOOGL',
    'microsoft': 'MSFT',
    'tesla': 'TSLA',
    'meta': 'META',
    'nvidia': 'NVDA',
    'general_motors': 'GM',
    'ford': 'F',
    'crowdstrike': 'CRWD',
    'palantir': 'PLTR',
}

In [66]:
# Relative Strength Index (RSI)
def compute_rsi(df, column='Close', window=14):
    """Add an 'RSI' column (simple-moving-average Relative Strength Index) to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the price series; it is modified in place.
    column : str
        Name of the price column the RSI is computed from.
    window : int
        Number of price changes averaged for each RSI value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the 'RSI' column added (or overwritten).

    Notes
    -----
    The first ``window`` rows are NaN: ``window`` price changes are needed and
    the first row has no previous price to diff against. A window with no
    losses yields 100; a window with no price movement at all yields NaN (0/0).
    """
    delta = df[column].diff(1)
    # clip() keeps the leading NaN produced by diff(). Replacing that NaN with
    # 0 would inject a fabricated "no change" observation into the first
    # window and emit an RSI one row too early.
    gain = delta.clip(lower=0)
    loss = (-delta).clip(lower=0)
    avg_gain = gain.rolling(window).mean()
    avg_loss = loss.rolling(window).mean()
    # avg_loss == 0 with avg_gain > 0 gives rs == inf, which maps to RSI == 100.
    rs = avg_gain / avg_loss
    df['RSI'] = 100 - (100 / (1 + rs))
    return df

# Exponential Moving Average (EMA)
def compute_ema(df, column='Close', windows=(12, 26)):
    """Add one exponential-moving-average column per span to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the price series; it is modified in place.
    column : str
        Name of the price column to smooth.
    windows : iterable of int
        EMA spans. Each span ``w`` produces a column named ``EMA_<w>``.
        The default is a tuple so the shared default cannot be mutated
        between calls; lists are still accepted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the EMA columns added (or overwritten).
    """
    prices = df[column]
    for span in windows:
        # adjust=False selects the recursive form: y_t = (1 - a) * y_{t-1} + a * x_t
        df[f'EMA_{span}'] = prices.ewm(span=span, adjust=False).mean()
    return df

# Moving Average Convergence Divergence (MACD)
def compute_macd(df, column='Close', short_window=12, long_window=26, signal_window=9):
    """Add 'MACD' and 'Signal' columns to ``df`` and return it.

    'MACD' is the short-span EMA of ``column`` minus its long-span EMA;
    'Signal' is the EMA of that difference over ``signal_window``. All EMAs
    use the recursive (``adjust=False``) form. ``df`` is modified in place.
    """
    def _ema(values, span):
        # Single place that fixes the smoothing convention for all three EMAs.
        return values.ewm(span=span, adjust=False).mean()

    prices = df[column]
    macd_line = _ema(prices, short_window) - _ema(prices, long_window)

    df['MACD'] = macd_line
    df['Signal'] = _ema(macd_line, signal_window)
    return df

# Bollinger Bands
def compute_bollinger_bands(df, column='Close', window=20, num_std=2):
    """Add 'BB_upper' and 'BB_lower' Bollinger Band columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the price series; it is modified in place.
    column : str
        Name of the price column the bands are built around.
    window : int
        Length of the rolling window for both the mean and the standard
        deviation. The first ``window - 1`` rows are NaN.
    num_std : float
        Band half-width in rolling standard deviations. The default of 2
        reproduces the previously hard-coded width.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with both band columns added (or overwritten).
    """
    rolling = df[column].rolling(window)
    ma = rolling.mean()
    half_width = num_std * rolling.std()
    df['BB_upper'] = ma + half_width
    df['BB_lower'] = ma - half_width
    return df

# On-Balance Volume (OBV)
def compute_obv(df, column='Close', volume_column='Volume'):
    """Add an 'OBV' (On-Balance Volume) column to ``df`` and return it.

    Each row's volume is counted as positive when ``column`` rose from the
    previous row, negative when it fell and zero when it was unchanged; the
    running total of those signed volumes is stored. The first row has no
    previous price and contributes 0. ``df`` is modified in place.
    """
    # +1 / -1 / 0 per row; NaN on the first row, where there is nothing to diff against.
    direction = np.sign(df[column].diff())
    signed_volume = direction * df[volume_column]
    df['OBV'] = signed_volume.fillna(0).cumsum()
    return df

# Average True Range (ATR)
def compute_atr(df, high_col='High', low_col='Low', close_col='Close', window=14):
    """Add an 'ATR' (Average True Range) column to ``df`` and return it.

    The true range of a row is the largest of: high minus low, the absolute
    gap between the high and the previous close, and the absolute gap between
    the low and the previous close. ATR is the simple rolling mean of the true
    range over ``window`` rows. ``df`` is modified in place.
    """
    prev_close = df[close_col].shift(1)
    candidates = [
        df[high_col] - df[low_col],
        (df[high_col] - prev_close).abs(),
        (df[low_col] - prev_close).abs(),
    ]
    # Row-wise max skips NaN, so the first row (no previous close) falls back
    # to high minus low.
    true_range = pd.concat(candidates, axis=1).max(axis=1)
    df['ATR'] = true_range.rolling(window).mean()
    return df


In [67]:
def enrich_with_technical_indicators(df, ticker=None):
    """Append technical-indicator columns and flatten names to ``<ticker>_<field>``.

    Applies compute_ema, compute_macd, compute_bollinger_bands, compute_obv
    and compute_atr in that order, drops every row that contains a NaN, and
    renames the columns with the ticker as prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Price data with 'Close', 'High', 'Low' and 'Volume' fields. Columns
        may be a two-level (field, ticker) MultiIndex or flat field names.
        The compute_* helpers add their columns to this frame in place.
    ticker : str, optional
        Prefix for the output column names. When omitted it is read from the
        second level of the first column, which requires MultiIndex columns.

    Returns
    -------
    pandas.DataFrame
        A new frame (result of ``dropna``) with flat string column names such
        as ``AAPL_Close`` or ``AAPL_MACD``.

    Raises
    ------
    ValueError
        If ``df`` has flat columns and no ``ticker`` is given. Indexing a flat
        column name would otherwise silently use its second character as the
        ticker and corrupt every column name.
    """
    # NOTE(review): compute_rsi is defined in this notebook but is not applied
    # here — confirm that the omission is intentional.
    if ticker is None:
        if not isinstance(df.columns, pd.MultiIndex):
            raise ValueError(
                "Cannot infer the ticker from flat columns; pass ticker=... explicitly."
            )
        ticker = df.columns[0][1]

    df = compute_ema(df)
    df = compute_macd(df)
    df = compute_bollinger_bands(df)
    df = compute_obv(df)
    df = compute_atr(df)
    # Drops the leading rows where the rolling indicators are still NaN.
    df = df.dropna()
    # MultiIndex labels are tuples (field, ticker); flat labels are used as-is.
    df.columns = [
        f"{ticker}_{col[0] if isinstance(col, tuple) else col}" for col in df.columns
    ]

    return df

# Smoke test: enrich three years of AAPL history, then inspect the resulting
# column names and the last two rows.
test = enrich_with_technical_indicators(
    yf.download('AAPL', start='2020-01-01', end='2023-01-01')
)
print(test.columns)
display(test.tail(2))

  test = enrich_with_technical_indicators(yf.download('AAPL', start='2020-01-01', end='2023-01-01'))
[*********************100%***********************]  1 of 1 completed

Index(['AAPL_Close', 'AAPL_High', 'AAPL_Low', 'AAPL_Open', 'AAPL_Volume',
       'AAPL_EMA_12', 'AAPL_EMA_26', 'AAPL_MACD', 'AAPL_Signal',
       'AAPL_BB_upper', 'AAPL_BB_lower', 'AAPL_OBV', 'AAPL_ATR'],
      dtype='object')





Unnamed: 0_level_0,AAPL_Close,AAPL_High,AAPL_Low,AAPL_Open,AAPL_Volume,AAPL_EMA_12,AAPL_EMA_26,AAPL_MACD,AAPL_Signal,AAPL_BB_upper,AAPL_BB_lower,AAPL_OBV,AAPL_ATR
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2022-12-29,127.952591,128.811461,126.096634,126.353304,75703700,131.81959,136.209763,-4.390173,-3.357134,150.134772,122.876757,28777300.0,4.432584
2022-12-30,128.268463,128.288212,125.80044,126.767912,77034200,131.273263,135.621518,-4.348255,-3.555358,148.861379,122.335656,105811500.0,4.280975


In [74]:
# Download historical data for context tickers and apply technical indicators
def download_and_enrich_data(tickers: dict, start='2010-01-01', end='2025-06-15'):
    """Download price history for each ticker and add technical indicators.

    Parameters
    ----------
    tickers : dict
        Mapping of descriptive name -> ticker symbol.
    start, end : str
        Date range forwarded to ``yf.download``.

    Returns
    -------
    dict
        Mapping of descriptive name -> enriched DataFrame, in the iteration
        order of ``tickers``.

    Raises
    ------
    ValueError
        If a download returns no rows. An empty frame would otherwise pass
        through unnoticed and wipe out any frame later built by joining the
        results and dropping rows with missing values.
    """
    enriched = {}
    for name, ticker in tickers.items():
        # auto_adjust is pinned so prices do not depend on the installed
        # yfinance version's default (the captured output has no 'Adj Close'
        # column, i.e. adjusted prices were already in effect).
        raw = yf.download(ticker, start=start, end=end, auto_adjust=True)
        if raw is None or raw.empty:
            raise ValueError(
                f"No data returned for {name!r} ({ticker}) between {start} and {end}"
            )
        enriched[name] = enrich_with_technical_indicators(raw)
    return enriched



# Download and enrich data
context_data = download_and_enrich_data(context_tickers)
# Join all frames on the date index, then keep only dates for which every
# ticker has a complete row.
context_df = pd.concat(context_data.values(), axis=1).dropna()

# verify timeframes
# (loop variable names are kept as-is because they leak into the notebook
# namespace; note that `ticker` here holds the descriptive name, not the symbol)
for ticker, df in context_data.items():
    print(f"{ticker} index range: {df.index.min()} to {df.index.max()}", 'count:', len(df))

print(context_df.describe())

# Create the output directory on a fresh checkout so to_csv does not fail.
os.makedirs('../data', exist_ok=True)
context_df.to_csv('../data/context_data.csv')

  df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start=start, end=end)
[******

30_year_bonds index range: 2010-02-01 00:00:00 to 2025-06-13 00:00:00 count: 3867
10_year_bonds index range: 2010-02-01 00:00:00 to 2025-06-13 00:00:00 count: 3867
5_year_bonds index range: 2010-02-01 00:00:00 to 2025-06-13 00:00:00 count: 3867
2_year_bonds index range: 2010-02-01 00:00:00 to 2025-06-13 00:00:00 count: 3867
1_year_bonds index range: 2010-02-01 00:00:00 to 2025-06-13 00:00:00 count: 3867
gold index range: 2010-02-01 00:00:00 to 2025-06-13 00:00:00 count: 3867
silver index range: 2010-02-01 00:00:00 to 2025-06-13 00:00:00 count: 3867
oil index range: 2010-02-01 00:00:00 to 2025-06-13 00:00:00 count: 3867
natural_gas index range: 2010-02-01 00:00:00 to 2025-06-13 00:00:00 count: 3867
sp500 index range: 2010-02-01 00:00:00 to 2025-06-13 00:00:00 count: 3867
nasdaq index range: 2010-02-01 00:00:00 to 2025-06-13 00:00:00 count: 3867
dow_jones index range: 2010-02-01 00:00:00 to 2025-06-13 00:00:00 count: 3867
russell_2000 index range: 2010-02-01 00:00:00 to 2025-06-13 00:00: