In [3]:
import yfinance as yf
import pandas as pd

# Configuration
# NOTE: yfinance treats `end` as exclusive, so the last calendar day requested
# below is 2025-12-30, not 2025-12-31.
start_date = "2016-01-01"
end_date = "2025-12-31"
file_name = "market_data_raw.csv"

# Yahoo Finance symbol -> short column name used in the saved CSV
tickers = {
    "BTC-USD": "BTC",
    "GC=F": "Gold",
    "^GSPC": "SP500",
    "DX-Y.NYB": "DXY",
    "^TNX": "Yield_10Y",
    "^VIX": "VIX"
}

# Extraction: a single request covering every symbol in `tickers`
price_panel = yf.download(
    list(tickers),
    start=start_date,
    end=end_date,
    auto_adjust=False,
    multi_level_index=False,
)

# Fail loudly rather than writing an empty CSV when nothing came back
if price_panel.empty:
    raise RuntimeError(
        f"yfinance returned no rows for {list(tickers)} "
        f"between {start_date} and {end_date}"
    )

# If 'Adj Close' is available, take it. If not, fall back to 'Close'.
price_field = 'Adj Close' if 'Adj Close' in price_panel.columns else 'Close'

# Select and rename in one expression: rename() returns a new frame, so the
# slice taken from the download is never mutated in place (the original
# `inplace=True` on a sliced frame is the chained-assignment pattern pandas
# warns about).
market_data = price_panel[price_field].rename(columns=tickers)

# Save to CSV for cleaning (path is relative to the current working directory)
market_data.to_csv(file_name)

# Confirmation
print(f"Market data saved to {file_name}")

[*********************100%***********************]  6 of 6 completed

Market data saved to market_data_raw.csv





In [None]:
# Load the raw extract back from disk, with the 'Date' column parsed as the index.
# NOTE(review): this is an absolute, machine-specific path, while the download
# cell writes "market_data_raw.csv" relative to the working directory. Confirm
# the file really is placed under data\raw before this cell is run.
file_path = r"c:\Users\rober\Documents\Python\rg_project\data\raw\market_data_raw.csv"
df_market_raw = pd.read_csv(
    file_path,
    parse_dates=True,
    index_col='Date',
)

# Preview of the first five rows
print(df_market_raw.iloc[:5])

In [None]:
# Structural checks on the raw frame. Each result is printed explicitly: a bare
# expression in the middle of a cell is evaluated and then discarded, so the
# original statements displayed nothing.
print(df_market_raw.shape)
# Observed on the recorded run:
# (3652, 6)

print(df_market_raw.dtypes)
# Observed on the recorded run:
# BTC          float64
# DXY          float64
# Gold         float64
# SP500        float64
# Yield_10Y    float64
# VIX          float64

print(df_market_raw.isna().sum())
# Observed on the recorded run:
# BTC             0 -- BTC trades 24/7
# DXY          1138
# Gold         1140
# SP500        1139
# Yield_10Y    1140
# VIX          1139

# Market Synced version: keep only the dates on which every non-BTC series has
# a value (this removes weekends and any other day where one of them is missing)
df_market_synced = df_market_raw.dropna(subset=['SP500', 'Gold', 'DXY', 'VIX', 'Yield_10Y'])

# Market Filled version: keep every row; gaps take the previous known value
# (ffill), and any NaNs still left at the very start of a column take the first
# later value (bfill), so those leading rows carry a value from the future.
df_market_filled = df_market_raw.ffill().bfill()

# Checking
print(df_market_synced.head())
print(df_market_filled.head())

In [None]:
# Persist both cleaned variants as CSV files (relative to the working directory)
for frame, csv_name in (
    (df_market_synced, "market_data_synced_clean.csv"),
    (df_market_filled, "market_data_filled_clean.csv"),
):
    frame.to_csv(csv_name)

In [None]:
# Read the 'filled' clean CSV into a DataFrame, with 'Date' parsed as the index.
# NOTE(review): absolute, machine-specific path; the cell that creates
# "market_data_filled_clean.csv" writes it relative to the working directory.
# Confirm the file is placed under data\processed before running this cell.
file_path = r"c:\Users\rober\Documents\Python\rg_project\data\processed\market_data_filled_clean.csv"
df_market_filled_clean = pd.read_csv(
    file_path,
    parse_dates=True,
    index_col='Date',
)

# Preview of the first five rows
print(df_market_filled_clean.iloc[:5])

In [None]:
# Base-100 price normalisation: (Current Price / Starting Price) x 100,
# where "starting" is the first row of the frame.
# VIX is left out on purpose and remains an absolute level.

base_cols = ['BTC', 'DXY', 'Gold', 'SP500', 'Yield_10Y']

# Divide every selected column by its own first-row value in one step
# (the Series of first values is aligned to the frame by column label)
start_row = df_market_filled_clean[base_cols].iloc[0]
rebased = df_market_filled_clean[base_cols].div(start_row).mul(100)

# Attach the results as '<column>_norm', in the same order as base_cols
for col in base_cols:
    df_market_filled_clean[f'{col}_norm'] = rebased[col]

# Checking results
print(df_market_filled_clean.head())

In [None]:
# Row-over-row change columns used for correlation.
# Price-like series get a relative change; Yield_10Y is a % already, so it
# gets a first difference instead.

# 'BTC', 'DXY', 'Gold', 'SP500' % change columns
price_cols = ['BTC', 'DXY', 'Gold', 'SP500']
for col in price_cols:
    df_market_filled_clean[f'{col}_pct_chg'] = df_market_filled_clean[col].pct_change()

# Yield change column
# NOTE: despite the '_pct_chg' suffix this is a plain difference, not a ratio
df_market_filled_clean['Yield_10Y_pct_chg'] = df_market_filled_clean['Yield_10Y'].diff()

# Clean up the first row: pct_change()/diff() have no previous row there and
# yield NaN. Only the new change columns are zero-filled, so a missing value in
# any other column is never silently turned into 0 (the original frame-wide
# fillna(0) would have done that).
change_cols = [f'{col}_pct_chg' for col in price_cols] + ['Yield_10Y_pct_chg']
df_market_filled_clean[change_cols] = df_market_filled_clean[change_cols].fillna(0)

# Checking results
print(df_market_filled_clean.head())

In [None]:
# Persist the enriched 'filled' frame as the final CSV
# (the bare file name is resolved against the current working directory)
df_market_filled_clean.to_csv(path_or_buf="market_data_filled_final.csv")

In [None]:
# Read the 'synced' clean CSV into a DataFrame, with 'Date' parsed as the index.
# NOTE(review): absolute, machine-specific path; the cell that creates
# "market_data_synced_clean.csv" writes it relative to the working directory.
# Confirm the file is placed under data\processed before running this cell.
file_path = r"c:\Users\rober\Documents\Python\rg_project\data\processed\market_data_synced_clean.csv"
df_market_synced_clean = pd.read_csv(
    file_path,
    parse_dates=True,
    index_col='Date',
)

# Preview of the first five rows
print(df_market_synced_clean.iloc[:5])

In [None]:
# Base-100 price normalisation: (Current Price / Starting Price) x 100,
# where "starting" is the first row of the frame.
# VIX is left out on purpose and remains an absolute level.

base_cols = ['BTC', 'DXY', 'Gold', 'SP500', 'Yield_10Y']

# Divide every selected column by its own first-row value in one step
# (the Series of first values is aligned to the frame by column label)
start_row = df_market_synced_clean[base_cols].iloc[0]
rebased = df_market_synced_clean[base_cols].div(start_row).mul(100)

# Attach the results as '<column>_norm', in the same order as base_cols
for col in base_cols:
    df_market_synced_clean[f'{col}_norm'] = rebased[col]

# Checking results
print(df_market_synced_clean.head())

In [None]:
# Row-over-row change columns used for correlation.
# Price-like series get a relative change; Yield_10Y is a % already, so it
# gets a first difference instead.

# 'BTC', 'DXY', 'Gold', 'SP500' % change columns
price_cols = ['BTC', 'DXY', 'Gold', 'SP500']
for col in price_cols:
    df_market_synced_clean[f'{col}_pct_chg'] = df_market_synced_clean[col].pct_change()

# Yield change column
# NOTE: despite the '_pct_chg' suffix this is a plain difference, not a ratio
df_market_synced_clean['Yield_10Y_pct_chg'] = df_market_synced_clean['Yield_10Y'].diff()

# Clean up the first row: pct_change()/diff() have no previous row there and
# yield NaN. Only the new change columns are zero-filled, so a missing value in
# any other column is never silently turned into 0 (the original frame-wide
# fillna(0) would have done that).
change_cols = [f'{col}_pct_chg' for col in price_cols] + ['Yield_10Y_pct_chg']
df_market_synced_clean[change_cols] = df_market_synced_clean[change_cols].fillna(0)

# Checking results
print(df_market_synced_clean.head())

In [None]:
# Persist the enriched 'synced' frame as the final CSV
# (the bare file name is resolved against the current working directory).
# NOTE(review): the file name is spelled "sycned" (sic). It is kept unchanged
# because other code may already read that exact name; confirm before renaming.
df_market_synced_clean.to_csv(path_or_buf="market_data_sycned_final.csv")