## Step 1: Import Libraries

In [1]:
from datetime import datetime
from pathlib import Path
from typing import List

import duckdb
import pandas as pd
import polars as pl
import pyarrow as pa
import yfinance as yf

## Step 2: Import Symbols

In [None]:
def load_symbols(file_path: str) -> List[str]:
    """Load symbols from a text file, one symbol per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped. The file is decoded as UTF-8; a leading byte-order mark, if
    present, is discarded so it does not end up attached to the first symbol.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The symbols in file order, or an empty list when the file cannot be
        opened or decoded (the error is printed instead of raised).
    """
    try:
        # "utf-8-sig" reads plain UTF-8 as well and drops a BOM when one exists.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            stripped_lines = (line.strip() for line in f)
            symbols = [symbol for symbol in stripped_lines if symbol]
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: undecodable bytes
        # (UnicodeDecodeError) or an invalid path string.
        print(f"Error loading symbols: {str(e)}")
        return []

    print(f"Loaded {len(symbols)} symbols from {file_path}")
    return symbols

# Location of the ticker list, relative to the notebook's working directory
symbols_file = '../tickers.txt'
symbols = load_symbols(symbols_file)

# Report the outcome: load_symbols returns an empty list when nothing was read
if not symbols:
    print("No symbols loaded.")
else:
    print("Symbols:", symbols)

## Step 3: Extract Data from yfinance into Pandas

In [None]:
# Requested date range: fixed start, end is today's date as YYYY-MM-DD
start_date = '2020-01-01'
end_date = datetime.today().strftime('%Y-%m-%d')

# Stop with a clear message instead of sending an empty request to yfinance
if not symbols:
    raise ValueError("No symbols loaded; nothing to download.")

# Download prices from yfinance
prices = yf.download(symbols, start=start_date, end=end_date, group_by='ticker')

# Check if the data has a MultiIndex (due to multiple symbols)
if isinstance(prices.columns, pd.MultiIndex):
    # Flatten the MultiIndex columns into "<first level>_<second level>" names
    prices.columns = ['_'.join(filter(None, map(str, col))) for col in prices.columns]
elif len(symbols) == 1:
    # Flat columns carry no symbol part, so the "<symbol>_<metric>" extraction
    # below would match nothing. Prefix the only requested symbol instead.
    # NOTE(review): presumably flat columns only occur for a single-ticker
    # download on some yfinance versions - confirm.
    prices.columns = [f"{symbols[0]}_{col}" for col in prices.columns]

# Reset index and melt the DataFrame to include a "symbol" column
prices = prices.copy()  # Avoid fragmentation issues
prices.reset_index(inplace=True)
prices = prices.melt(id_vars=["Date"], var_name="Metric", value_name="Value")
# Split each flattened name at its first underscore: symbol before, metric after
prices[["Symbol", "Metric"]] = prices["Metric"].str.extract(r'([^_]+)_(.+)')
# Back to wide form: one row per (Date, Symbol), one column per metric
prices = prices.pivot(index=["Date", "Symbol"], columns="Metric", values="Value").reset_index()

display(prices)

## Step 4: Convert Pandas to Polars

In [None]:
# Convert the pandas DataFrame `prices` into a Polars DataFrame
df = pl.from_pandas(prices)

# Print the Polars DataFrame to inspect the converted result
print(df)

## Step 5: Write Polars to Parquet

In [5]:
output_dir = "../../../data/finance"

# Create the target directory (and any missing parents) so the write below
# does not fail when the folder does not exist yet
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Write DataFrame to Parquet; the file name records the requested date range
df.write_parquet(f'{output_dir}/historical_stock_quotes_{start_date}_to_{end_date}.parquet')

## Step 6: Read Parquet (Validate)

In [None]:
# Lazily scan the Parquet file and keep only its first rows
parquet_file = f'{output_dir}/historical_stock_quotes_{start_date}_to_{end_date}.parquet'
preview = pl.scan_parquet(parquet_file).head()

# Materialize the preview; as the cell's last expression it is displayed
preview.collect()