In [1]:
import pandas as pd
import sys
import os
import numpy as np
import yfinance as yf
import torch.nn as nn
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
pd.set_option('display.max_columns', None)

import sys
import os
sys.path.append(os.path.abspath("../src"))
from technical_indicators import enrich_with_technical_indicators

# suppress yfinance error messages
import contextlib
@contextlib.contextmanager
def suppress_stdout_stderr():
    """Context manager that silences ``sys.stdout`` and ``sys.stderr``.

    While the ``with`` body runs, both streams point at a single handle opened
    on ``os.devnull``. The previous stream objects are put back when the body
    finishes, including when it raises (the exception still propagates).
    """
    with open(os.devnull, 'w') as sink:
        # redirect_stdout / redirect_stderr swap the sys attribute on entry and
        # restore the previous object on exit, so no manual try/finally is needed.
        with contextlib.redirect_stdout(sink), contextlib.redirect_stderr(sink):
            yield

In [2]:
# Market-context universe: descriptive name -> symbol string.
# Dicts of this shape are what download_and_enrich_data() iterates over
# (name is used as the dict key of the per-ticker frame, the symbol is passed
# to yf.download and used as the column-name prefix).
# Sub-groups below are organisational only; the dict itself is flat.
market_context_tickers = {
    # Bonds & Rates
    '20_30_year_bonds': 'TLT',
    '7_10_year_bonds': 'IEF',
    '5_year_bonds': 'IEI',
    '2_year_bonds': 'SHY',
    '1_year_bonds': 'SHV',
    'investment_grade_corp_bonds': 'LQD',
    'high_yield_corp_bonds': 'HYG',
    'treasury_inflation_protected': 'TIP',

    # Commodities
    'gold': 'GLD',
    'silver': 'SLV',
    'copper': 'CPER',
    'oil': 'USO',
    'natural_gas': 'UNG',
    'agriculture': 'DBA',

    # Equities - Major Indexes
    'sp500': 'SPY',
    'nasdaq': 'QQQ',
    'dow_jones': 'DIA',
    'russell_2000': 'IWM',
    'emerging_markets': 'EEM',
    'developed_markets': 'EFA',
    'china': 'FXI',

    # Sectors (SPDR ETFs)
    'financials': 'XLF',
    'technology': 'XLK',
    'energy': 'XLE',
    'consumer_discretionary': 'XLY',
    'consumer_staples': 'XLP',
    'health_care': 'XLV',
    'industrials': 'XLI',
    'materials': 'XLB',
    'real_estate': 'XLRE',
    'utilities': 'XLU',

    # Currencies
    'us_dollar': 'UUP',
    'euro': 'FXE',
    'british_pound': 'FXB',
    'japanese_yen': 'FXY',
    'canadian_dollar': 'FXC',
    'australian_dollar': 'FXA',
    'swiss_franc': 'FXF',

    # Volatility & Risk
    'vix_short_term': 'VIXY',
    'vix_mid_term': 'VXZ',
}

# Individual-stock universe: descriptive name -> symbol string, in the same
# name -> symbol shape that download_and_enrich_data() expects.
stock_tickers = {
    'costco': 'COST',
    'coinbase': 'COIN',
    'robinhood': 'HOOD',
    'amazon': 'AMZN',
    'apple': 'AAPL',
    'google': 'GOOGL',
    'microsoft': 'MSFT',
    'tesla': 'TSLA',
    'meta': 'META',
    'nvidia': 'NVDA',
    'general_motors': 'GM',
    'ford': 'F',
    'crowdstrike': 'CRWD',
    'palantir': 'PLTR',
}

# Crypto universe: descriptive name -> symbol string, in the same
# name -> symbol shape that download_and_enrich_data() expects.
# Lines that are commented out inside the dict are disabled entries.
crypto_tickers = {
    'bitcoin': 'BTC-USD',

    # Keep only ETH-BTC as a relative pair
    'ethereum_bitcoin': 'ETH-BTC',

    # Layer 1s
    'ethereum': 'ETH-USD',
    'solana': 'SOL-USD',
    'ripple': 'XRP-USD',
    'hedera': 'HBAR-USD',
    # DeFi
    'chainlink': 'LINK-USD',
    'uniswap': 'UNI-USD',
    'aave': 'AAVE-USD',
    'raydium': 'RAY-USD',
    'ondo': 'ONDO-USD',
    # NOTE(review): the key is 'morpho' but the symbol is 'MORPH-USD' —
    # confirm this is the intended symbol for the data provider.
    'morpho': 'MORPH-USD',
    # Disabled DeFi entries ('raydium' below duplicates the active entry above):
    # 'raydium': 'RAY-USD',
    # 'curve': 'CRV-USD',
    # 'aerodrome': 'AERO-USD',

    # AI coins
    'fetch_ai': 'FET-USD',
    'graph': 'GRT-USD',
    # Disabled AI-coin entries:
    # 'bittensor': 'TAO-USD',
    # 'grass': 'GRASS-USD',

    # meme coins
    'dogecoin': 'DOGE-USD',
    'shiba_inu': 'SHIB-USD',
    'pepe': 'PEPE-USD',
    'bonk': 'BONK-USD',
}

# Fill value forwarded as `null_value` to download_and_enrich_data(), where it
# is passed to DataFrame.fillna(). With NaN the fill leaves missing cells as
# NaN, overriding that function's own default of -99.
null_value = np.nan

In [3]:
# Download historical data for context tickers and apply technical indicators
def download_and_enrich_data(tickers: dict, period = 'max', interval='1d', null_value = -99):
    """Download price history for each ticker, enrich it, and merge into one frame.

    Each symbol is fetched with ``yf.download``, passed through
    ``enrich_with_technical_indicators``, and its columns are renamed to
    ``"<symbol>_<column>"``. The per-ticker frames are then concatenated
    column-wise and every column is prefixed with ``"<interval>_"``.

    Tickers that raise during download/enrichment, or that come back empty,
    are skipped and reported once via ``print('Failed tickers:', ...)``.

    Args:
        tickers (dict): Mapping of descriptive name -> symbol passed to ``yf.download``.
        period (str): Period for which to download data (default is 'max').
        interval (str): Data interval (default is '1d'); also used as the
            column-name prefix of the returned frame.
        null_value: Value used to fill cells that are missing after the
            column-wise concat (default is -99). Pass ``np.nan`` to keep gaps as NaN.

    Returns:
        pandas.DataFrame: One row per timestamp in the union of all ticker
        indexes, one column per ``"<interval>_<symbol>_<column>"``.

    Raises:
        ValueError: If no ticker produced any data (including an empty
            ``tickers`` mapping).
    """
    data = {}
    failed_tickers = []
    for name, ticker in tickers.items():
        try:
            with suppress_stdout_stderr():
                df = yf.download(ticker, period=period, interval=interval, progress=False)

            # A download that yields no rows is a failure, not a usable frame.
            if df is None or df.empty:
                failed_tickers.append((name, ticker))
                continue

            df = enrich_with_technical_indicators(df)
            # Tuple column labels keep only their first element (presumably the
            # field level of a (field, symbol) pair from yfinance — confirm);
            # plain string labels are used whole instead of being cut to one char.
            df.columns = [
                f"{ticker}_{col[0] if isinstance(col, tuple) else col}"
                for col in df.columns
            ]
            df.index = pd.to_datetime(df.index)
            data[name] = df
        except Exception:
            # Best-effort per ticker, but let KeyboardInterrupt / SystemExit through.
            failed_tickers.append((name, ticker))

    if len(failed_tickers) > 0:
        print('Failed tickers:', failed_tickers)

    if not data:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(
            f"No data could be downloaded for any requested ticker (interval={interval!r})."
        )

    combined_df = pd.concat(data.values(), axis=1).fillna(null_value)

    # Prefix every column with the interval so frames of different
    # granularities can be told apart.
    combined_df.columns = [f"{interval}_{col}" for col in combined_df.columns]

    return combined_df


def append_data_to_previous_download(new_df, data_path, overwrite=False):
    """Merge ``new_df`` into the CSV at ``data_path`` and write the result back.

    If the file exists and ``overwrite`` is falsy, the stored frame is read
    (first column as a parsed datetime index), ``new_df`` is appended below it,
    rows with a duplicated index keep the most recently appended values, the
    result is sorted by index, and all-NaN columns and rows are dropped.
    Otherwise ``new_df`` is written as-is.

    Args:
        new_df (pandas.DataFrame): Freshly downloaded data.
        data_path (str): Path of the CSV file to read from and write to.
        overwrite (bool): When truthy, ignore any existing file and replace it
            with ``new_df``.

    Returns:
        pandas.DataFrame: The frame that was written to ``data_path``.
    """
    # `not overwrite` rather than `overwrite is False`, so falsy non-bool values
    # such as 0 or None mean "append" instead of silently overwriting.
    if os.path.exists(data_path) and not overwrite:
        existing_df = pd.read_csv(data_path, index_col=0, parse_dates=True)
        combined_df = pd.concat([existing_df, new_df], axis=0)
        # new_df rows come last, so keep='last' lets fresh data win on overlap.
        combined_df = combined_df[~combined_df.index.duplicated(keep='last')]
        combined_df = combined_df.sort_index()
        combined_df = combined_df.dropna(axis=1, how='all')  # Drop columns that are all NaN
        combined_df = combined_df.dropna(axis=0, how='all')  # Drop rows that are all NaN
    else:
        combined_df = new_df

    # Create the target directory on first use so to_csv does not fail.
    parent_dir = os.path.dirname(data_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    combined_df.to_csv(data_path, index=True)

    return combined_df


In [4]:
# time_intervals = ['1d', '1h', '15m', '5m', '2m', '1m']

# for interval in time_intervals:
#     market_context_df = download_and_enrich_data(market_context_tickers, period = 'max', interval=interval, null_value = null_value)    
#     stock_df = download_and_enrich_data(stock_tickers, period = 'max', interval=interval, null_value = null_value)    
#     crypto_df = download_and_enrich_data(crypto_tickers, period = 'max', interval=interval, null_value = null_value)    

#     market_context_df = append_data_to_previous_download(market_context_df, f'../data/market_context/{interval}.csv')
#     stock_df = append_data_to_previous_download(stock_df, f'../data/stocks/{interval}.csv')
#     crypto_df = append_data_to_previous_download(crypto_df, f'../data/crypto/{interval}.csv')    

#     print('---------------------')
#     print(f'Interval: {interval}')
#     print('Market context | num rows:', market_context_df.shape[0], 'num days:', market_context_df.index.max() - market_context_df.index.min())
#     print('Stocks | num rows:', stock_df.shape[0], 'num days:', stock_df.index.max() - stock_df.index.min()) 
#     print('Crypto | num rows:', crypto_df.shape[0], 'num days:', crypto_df.index.max() - crypto_df.index.min())
#     print('---------------------\n')
    


In [5]:
# Reduced universe used by the refresh loop in this cell: a handful of crypto
# pairs plus selected market-context symbols, in the same name -> symbol shape
# as the larger dicts. Every entry here also appears in crypto_tickers or
# market_context_tickers.
simplified_crypto_tickers = {
    'bitcoin': 'BTC-USD',
    'ethereum': 'ETH-USD',
    'solana': 'SOL-USD',
    'ethereum_bitcoin': 'ETH-BTC',

    # market context tickers
    '20_30_year_bonds': 'TLT',
    '7_10_year_bonds': 'IEF',
    '5_year_bonds': 'IEI',
    '2_year_bonds': 'SHY',
    '1_year_bonds': 'SHV',

    'nasdaq': 'QQQ',
    'sp500': 'SPY',
    'dow_jones': 'DIA',
    'emerging_markets': 'EEM',
    'russell_2000': 'IWM',

    'us_dollar': 'UUP',
    'gold': 'GLD',
}

# Bar sizes to refresh; each one is stored in its own CSV.
time_intervals = ['1d', '1h', '15m', '5m', '2m', '1m']

for interval in time_intervals:
    # Fetch the latest bars for the reduced universe at this granularity...
    df = download_and_enrich_data(
        simplified_crypto_tickers,
        period='max',
        interval=interval,
        null_value=null_value,
    )
    # ...then fold them into whatever was saved by earlier runs.
    df = append_data_to_previous_download(df, f'../data/crypto/simple_{interval}.csv')

    # Summary: frame shape and the time span covered by the merged index.
    print('---------------------', f'Interval: {interval}', sep='\n')
    print('Simple crypto + context | shape:', df.shape, 'num days:', df.index.max() - df.index.min())
    print('---------------------\n')
    

---------------------
Interval: 1d
Simple crypto + context | shape: (9129, 448) num days: 11556 days 00:00:00
---------------------

---------------------
Interval: 1h
Simple crypto + context | shape: (20612, 448) num days: 721 days 17:00:00
---------------------

---------------------
Interval: 15m
Simple crypto + context | shape: (5562, 448) num days: 57 days 22:15:00
---------------------

---------------------
Interval: 5m
Simple crypto + context | shape: (17076, 448) num days: 59 days 07:25:00
---------------------

---------------------
Interval: 2m
Simple crypto + context | shape: (25924, 448) num days: 47 days 06:52:00
---------------------

---------------------
Interval: 1m
Simple crypto + context | shape: (11179, 448) num days: 7 days 20:32:00
---------------------

