## Step 1: Import Libraries

In [1]:
from datetime import datetime
from pathlib import Path
from typing import List

import duckdb
import pandas as pd
import polars as pl
import pyarrow as pa
import yfinance as yf

## Step 2: Import Symbols

In [2]:
def load_symbols(file_path: str) -> List[str]:
    """Load ticker symbols from a text file, one symbol per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped. The file is decoded as UTF-8; a leading byte-order mark (as
    written by some Windows editors) is discarded so it cannot end up glued
    to the first symbol.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The symbols in file order. An empty list is returned when the file
        cannot be read; the error is printed rather than raised.
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 and additionally drops a leading BOM.
        # Naming the encoding also keeps the result independent of the
        # platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            symbols = [
                stripped
                for stripped in (line.strip() for line in f)
                if stripped
            ]
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to [] so
        # the caller can decide what to do with an empty symbol list.
        print(f"Error loading symbols: {e}")
        return []

    print(f"Loaded {len(symbols)} symbols from {file_path}")
    return symbols

symbols_file = '../tickers.txt'
symbols = load_symbols(symbols_file)

# Echo the loaded tickers, or make it obvious that nothing came back
# (load_symbols returns an empty list when the file could not be read).
if not symbols:
    print("No symbols loaded.")
else:
    print("Symbols:", symbols)

Loaded 60 symbols from ../tickers.txt
Symbols: ['AAPL', 'MSFT', 'GOOGL', 'GOOG', 'META', 'NVDA', 'AVGO', 'ORCL', 'CRM', 'ACN', 'ADBE', 'CSCO', 'INTC', 'NFLX', 'DIS', 'CMCSA', 'VZ', 'T', 'AMZN', 'TSLA', 'HD', 'MCD', 'NKE', 'SBUX', 'TGT', 'LOW', 'WMT', 'PG', 'KO', 'PEP', 'COST', 'BRK-B', 'JPM', 'BAC', 'WFC', 'GS', 'MS', 'BLK', 'UNH', 'JNJ', 'PFE', 'ABBV', 'MRK', 'LLY', 'CAT', 'BA', 'HON', 'UPS', 'RTX', 'GE', 'XOM', 'CVX', 'COP', 'SLB', 'LIN', 'APD', 'ECL', 'PLD', 'AMT', 'CCI']


## Step 3: Extract Data from yfinance into Pandas

In [3]:
start_date = '2020-01-01'
end_date = datetime.today().strftime('%Y-%m-%d')

# Fail fast: load_symbols() returns [] when the tickers file could not be
# read, and there is nothing meaningful to download for an empty list.
if not symbols:
    raise ValueError("No symbols to download; check the tickers file before running this cell.")

# Download prices from yfinance; group_by='ticker' puts the ticker on the
# outer column level and the price field (Open, High, ...) on the inner one.
prices_raw = yf.download(symbols, start=start_date, end=end_date, group_by='ticker')

# Normalise the columns to a two-level (Symbol, Metric) MultiIndex. Working on
# the levels directly avoids gluing them into "TICKER_Field" strings and
# splitting them again with a regex, which mis-parses tickers containing "_".
if isinstance(prices_raw.columns, pd.MultiIndex):
    symbol_metric_columns = prices_raw.columns.set_names(["Symbol", "Metric"])
elif len(symbols) == 1:
    # Flat columns for a single ticker: attach the ticker as the outer level.
    symbol_metric_columns = pd.MultiIndex.from_product(
        [symbols, prices_raw.columns], names=["Symbol", "Metric"]
    )
else:
    raise ValueError("Expected per-ticker MultiIndex columns from yf.download for multiple symbols.")

# Work on a copy so the raw download stays untouched and the cell is re-runnable.
prices_wide = prices_raw.copy()
prices_wide.columns = symbol_metric_columns

# Wide -> long: one row per (Date, Symbol, Metric); the Date index is kept and
# then turned into a regular column.
prices_long = prices_wide.melt(ignore_index=False, value_name="Value").reset_index()

# Long -> tidy: one row per (Date, Symbol) with one column per metric.
prices = (
    prices_long
    .pivot(index=["Date", "Symbol"], columns="Metric", values="Value")
    .rename_axis(columns=None)  # drop the leftover "Metric" label on the column axis
    .reset_index()
)

display(prices)

[                       0%                       ]

[*                      3%                       ]  2 of 60 completed

[*                      3%                       ]  2 of 60 completed

[***                    7%                       ]  4 of 60 completed

[****                   8%                       ]  5 of 60 completed

[*****                 10%                       ]  6 of 60 completed

[******                12%                       ]  7 of 60 completed

[******                13%                       ]  8 of 60 completed

[*******               15%                       ]  9 of 60 completed

[********              17%                       ]  10 of 60 completed

[*********             18%                       ]  11 of 60 completed

[**********            20%                       ]  12 of 60 completed

[***********           22%                       ]  13 of 60 completed

[***********           22%                       ]  13 of 60 completed

[************          25%                       ]  15 of 60 completed

[*************         27%                       ]  16 of 60 completed

[*************         28%                       ]  17 of 60 completed

[**************        30%                       ]  18 of 60 completed

[***************       32%                       ]  19 of 60 completed

[****************      33%                       ]  20 of 60 completed

[*****************     35%                       ]  21 of 60 completed

[******************    37%                       ]  22 of 60 completed

[******************    38%                       ]  23 of 60 completed

[*******************   40%                       ]  24 of 60 completed

[********************  42%                       ]  25 of 60 completed

[********************* 43%                       ]  26 of 60 completed

[**********************45%                       ]  27 of 60 completed

[**********************47%                       ]  28 of 60 completed

[**********************48%                       ]  29 of 60 completed

[**********************50%                       ]  30 of 60 completed

[**********************52%                       ]  31 of 60 completed

[**********************53%                       ]  32 of 60 completed

[**********************55%*                      ]  33 of 60 completed

[**********************57%**                     ]  34 of 60 completed

[**********************58%***                    ]  35 of 60 completed

[**********************60%****                   ]  36 of 60 completed

[**********************62%*****                  ]  37 of 60 completed

[**********************63%*****                  ]  38 of 60 completed

[**********************65%******                 ]  39 of 60 completed

[**********************67%*******                ]  40 of 60 completed

[**********************68%********               ]  41 of 60 completed

[**********************70%*********              ]  42 of 60 completed

[**********************72%**********             ]  43 of 60 completed

[**********************73%**********             ]  44 of 60 completed

[**********************75%***********            ]  45 of 60 completed

[**********************77%************           ]  46 of 60 completed

[**********************78%************           ]  47 of 60 completed

[**********************80%*************          ]  48 of 60 completed

[**********************82%**************         ]  49 of 60 completed

[**********************83%***************        ]  50 of 60 completed

[**********************85%****************       ]  51 of 60 completed

[**********************87%*****************      ]  52 of 60 completed

[**********************88%*****************      ]  53 of 60 completed

[**********************90%******************     ]  54 of 60 completed

[**********************92%*******************    ]  55 of 60 completed

[**********************93%********************   ]  56 of 60 completed

[**********************95%*********************  ]  57 of 60 completed

[**********************97%********************** ]  58 of 60 completed

[**********************98%********************** ]  59 of 60 completed

[*********************100%***********************]  60 of 60 completed




Metric,Date,Symbol,Close,High,Low,Open,Volume
0,2020-01-02,AAPL,72.796005,72.856598,71.545372,71.799858,135480400.0
1,2020-01-02,ABBV,72.264893,72.281029,71.425635,71.885612,5639200.0
2,2020-01-02,ACN,195.263596,196.908218,194.018521,195.923308,2431100.0
3,2020-01-02,ADBE,334.429993,334.480011,329.170013,330.000000,1990100.0
4,2020-01-02,AMT,200.433960,202.758462,200.004141,201.469017,1426000.0
...,...,...,...,...,...,...,...
75595,2025-01-03,UPS,123.790001,124.330002,122.760002,123.959999,3490400.0
75596,2025-01-03,VZ,40.259998,40.630001,40.110001,40.320000,15223400.0
75597,2025-01-03,WFC,71.309998,71.419998,69.739998,70.349998,9152800.0
75598,2025-01-03,WMT,90.779999,91.300003,90.139999,90.150002,10815200.0


## Step 4: Convert Pandas to Polars

In [4]:
# Convert the tidy pandas frame into a Polars DataFrame.
df = pl.from_pandas(prices)

# Preview with the notebook's rich renderer (the same display() helper the
# Step 3 cell uses) instead of print(), which only emits the plain-text table.
display(df)

shape: (75_600, 7)
┌─────────────────────┬────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
│ Date                ┆ Symbol ┆ Close      ┆ High       ┆ Low        ┆ Open       ┆ Volume     │
│ ---                 ┆ ---    ┆ ---        ┆ ---        ┆ ---        ┆ ---        ┆ ---        │
│ datetime[ns]        ┆ str    ┆ f64        ┆ f64        ┆ f64        ┆ f64        ┆ f64        │
╞═════════════════════╪════════╪════════════╪════════════╪════════════╪════════════╪════════════╡
│ 2020-01-02 00:00:00 ┆ AAPL   ┆ 72.796005  ┆ 72.856598  ┆ 71.545372  ┆ 71.799858  ┆ 1.354804e8 │
│ 2020-01-02 00:00:00 ┆ ABBV   ┆ 72.264893  ┆ 72.281029  ┆ 71.425635  ┆ 71.885612  ┆ 5.6392e6   │
│ 2020-01-02 00:00:00 ┆ ACN    ┆ 195.263596 ┆ 196.908218 ┆ 194.018521 ┆ 195.923308 ┆ 2.4311e6   │
│ 2020-01-02 00:00:00 ┆ ADBE   ┆ 334.429993 ┆ 334.480011 ┆ 329.170013 ┆ 330.0      ┆ 1.9901e6   │
│ 2020-01-02 00:00:00 ┆ AMT    ┆ 200.43396  ┆ 202.758462 ┆ 200.004141 ┆ 201.469017 ┆ 1.426e6    │
│

## Step 5: Write Polars to Parquet

In [5]:
output_dir = "../../../data/finance"

# Create the target directory (and any missing parents) up front so the write
# below also succeeds on a fresh checkout where the folder does not exist yet.
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Write DataFrame to Parquet; the file name records the requested date range.
df.write_parquet(f'{output_dir}/historical_stock_quotes_{start_date}_to_{end_date}.parquet')

## Step 6: Read Parquet (Validate)

In [6]:
# Point at the file written in Step 5, take only the first rows of the scan,
# and collect them; the collected frame is the cell's displayed result.
parquet_file = f'{output_dir}/historical_stock_quotes_{start_date}_to_{end_date}.parquet'
parquet_preview = pl.scan_parquet(parquet_file).head()
parquet_preview.collect()

Date,Symbol,Close,High,Low,Open,Volume
datetime[ns],str,f64,f64,f64,f64,f64
2020-01-02 00:00:00,"""AAPL""",72.796005,72.856598,71.545372,71.799858,135480400.0
2020-01-02 00:00:00,"""ABBV""",72.264893,72.281029,71.425635,71.885612,5639200.0
2020-01-02 00:00:00,"""ACN""",195.263596,196.908218,194.018521,195.923308,2431100.0
2020-01-02 00:00:00,"""ADBE""",334.429993,334.480011,329.170013,330.0,1990100.0
2020-01-02 00:00:00,"""AMT""",200.43396,202.758462,200.004141,201.469017,1426000.0
