## Step 1: Import Libraries

In [1]:
from datetime import datetime
import polars as pl
import pandas as pd
import pyarrow as pa
from typing import List
import duckdb
import yfinance as yf

## Step 2: Import Symbols

In [2]:
def load_symbols(file_path: str) -> List[str]:
    """Load symbols from a text file"""
    try:
        with open(file_path, 'r') as f:
            symbols = [line.strip() for line in f if line.strip()]
        print(f"Loaded {len(symbols)} symbols from {file_path}")
        return symbols
    except Exception as e:
        print(f"Error loading symbols: {str(e)}")
        return []

symbols_file = '../tickers.txt'
symbols = load_symbols(symbols_file)

if symbols:
    print("Symbols:", symbols)
else:
    print("No symbols loaded.")

Loaded 61 symbols from ../tickers.txt
Symbols: ['AAPL', 'MSFT', 'GOOGL', 'GOOG', 'META', 'NVDA', 'AVGO', 'ORCL', 'CRM', 'ACN', 'ADBE', 'CSCO', 'INTC', 'NFLX', 'DIS', 'CMCSA', 'VZ', 'T', 'AMZN', 'TSLA', 'HD', 'MCD', 'NKE', 'SBUX', 'TGT', 'LOW', 'WMT', 'PG', 'KO', 'PEP', 'COST', 'BRK-B', 'JPM', 'BAC', 'WFC', 'GS', 'MS', 'BLK', 'UNH', 'JNJ', 'PFE', 'ABBV', 'MRK', 'LLY', 'CAT', 'BA', 'HON', 'UPS', 'RTX', 'GE', 'XOM', 'CVX', 'COP', 'SLB', 'LIN', 'APD', 'ECL', 'PLD', 'AMT', 'CCI', 'OSW']


## Step 3: Extract Data from Yfinance into Pandas

In [3]:
start_date = '2020-01-01'
end_date = datetime.today().strftime('%Y-%m-%d')

# Download prices from yfinance
prices = yf.download(symbols, start=start_date, end=end_date, group_by='ticker')

# Check if the data has a MultiIndex (due to multiple symbols)
if isinstance(prices.columns, pd.MultiIndex):
    # Flatten the MultiIndex columns
    prices.columns = ['_'.join(filter(None, map(str, col))) for col in prices.columns]

# Reset index and melt the DataFrame to include a "symbol" column
prices = prices.copy()  # Avoid fragmentation issues
prices.reset_index(inplace=True)
prices = prices.melt(id_vars=["Date"], var_name="Metric", value_name="Value")
prices[["Symbol", "Metric"]] = prices["Metric"].str.extract(r'([^_]+)_(.+)')
prices = prices.pivot(index=["Date", "Symbol"], columns="Metric", values="Value").reset_index()

display(prices)

[                       0%                       ]

[*                      3%                       ]  2 of 61 completed

[**                     5%                       ]  3 of 61 completed

[***                    7%                       ]  4 of 61 completed

[****                   8%                       ]  5 of 61 completed

[*****                 10%                       ]  6 of 61 completed

[*****                 11%                       ]  7 of 61 completed

[******                13%                       ]  8 of 61 completed

[*******               15%                       ]  9 of 61 completed

[********              16%                       ]  10 of 61 completed

[*********             18%                       ]  11 of 61 completed

[**********            20%                       ]  12 of 61 completed

[**********            21%                       ]  13 of 61 completed

[***********           23%                       ]  14 of 61 completed

[************          25%                       ]  15 of 61 completed

[************          26%                       ]  16 of 61 completed

[*************         28%                       ]  17 of 61 completed

[**************        30%                       ]  18 of 61 completed

[***************       31%                       ]  19 of 61 completed

[****************      33%                       ]  20 of 61 completed

[****************      34%                       ]  21 of 61 completed

[*****************     36%                       ]  22 of 61 completed

[******************    38%                       ]  23 of 61 completed

[*******************   39%                       ]  24 of 61 completed

[********************  41%                       ]  25 of 61 completed

[********************* 43%                       ]  26 of 61 completed

[********************* 44%                       ]  27 of 61 completed

[**********************46%                       ]  28 of 61 completed

[**********************48%                       ]  29 of 61 completed

[**********************49%                       ]  30 of 61 completed

[**********************51%                       ]  31 of 61 completed

[**********************52%                       ]  32 of 61 completed

[**********************54%*                      ]  33 of 61 completed

[**********************56%**                     ]  34 of 61 completed

[**********************57%**                     ]  35 of 61 completed

[**********************59%***                    ]  36 of 61 completed

[**********************61%****                   ]  37 of 61 completed

[**********************62%*****                  ]  38 of 61 completed

[**********************64%******                 ]  39 of 61 completed

[**********************66%*******                ]  40 of 61 completed

[**********************67%*******                ]  41 of 61 completed

[**********************69%********               ]  42 of 61 completed

[**********************70%*********              ]  43 of 61 completed

[**********************72%**********             ]  44 of 61 completed

[**********************74%***********            ]  45 of 61 completed

[**********************75%***********            ]  46 of 61 completed

[**********************77%************           ]  47 of 61 completed

[**********************79%*************          ]  48 of 61 completed

[**********************80%*************          ]  49 of 61 completed

[**********************82%**************         ]  50 of 61 completed

[**********************84%***************        ]  51 of 61 completed

[**********************85%****************       ]  52 of 61 completed

[**********************87%*****************      ]  53 of 61 completed

[**********************89%******************     ]  54 of 61 completed

[**********************90%******************     ]  55 of 61 completed

[**********************92%*******************    ]  56 of 61 completed

[**********************93%********************   ]  57 of 61 completed

[**********************95%*********************  ]  58 of 61 completed

[**********************97%********************** ]  59 of 61 completed

[**********************98%********************** ]  60 of 61 completed

[*********************100%***********************]  61 of 61 completed



1 Failed download:


['NKE']: OperationalError('database is locked')


Metric,Date,Symbol,Adj Close,Close,High,Low,Open,Volume
0,2020-01-02,AAPL,,72.796043,72.856636,71.545410,71.799896,135480400.0
1,2020-01-02,ABBV,,71.589775,71.605761,70.758358,71.214038,5639200.0
2,2020-01-02,ACN,,195.263580,196.908203,194.018505,195.923292,2431100.0
3,2020-01-02,ADBE,,334.429993,334.480011,329.170013,330.000000,1990100.0
4,2020-01-02,AMT,,200.433945,202.758447,200.004125,201.469002,1426000.0
...,...,...,...,...,...,...,...,...
77770,2025-01-28,UPS,,135.419998,136.990005,135.350006,135.759995,3094000.0
77771,2025-01-28,VZ,,40.400002,40.810001,40.349998,40.599998,21030000.0
77772,2025-01-28,WFC,,77.879997,78.150002,77.169998,77.870003,11560800.0
77773,2025-01-28,WMT,,97.290001,97.839996,96.730003,97.230003,14636000.0


## Step 4: Convert Pandas to Polars

In [4]:
# Convert to Polars DataFrame
df = pl.from_pandas(prices)

# View the Polars DataFrame
print(df)

shape: (77_775, 8)
┌────────────┬────────┬───────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
│ Date       ┆ Symbol ┆ Adj Close ┆ Close      ┆ High       ┆ Low        ┆ Open       ┆ Volume     │
│ ---        ┆ ---    ┆ ---       ┆ ---        ┆ ---        ┆ ---        ┆ ---        ┆ ---        │
│ datetime[n ┆ str    ┆ f64       ┆ f64        ┆ f64        ┆ f64        ┆ f64        ┆ f64        │
│ s]         ┆        ┆           ┆            ┆            ┆            ┆            ┆            │
╞════════════╪════════╪═══════════╪════════════╪════════════╪════════════╪════════════╪════════════╡
│ 2020-01-02 ┆ AAPL   ┆ null      ┆ 72.796043  ┆ 72.856636  ┆ 71.54541   ┆ 71.799896  ┆ 1.354804e8 │
│ 00:00:00   ┆        ┆           ┆            ┆            ┆            ┆            ┆            │
│ 2020-01-02 ┆ ABBV   ┆ null      ┆ 71.589775  ┆ 71.605761  ┆ 70.758358  ┆ 71.214038  ┆ 5.6392e6   │
│ 00:00:00   ┆        ┆           ┆            ┆            ┆           

## Step 5: Write Polars to Parquet

In [5]:
output_dir = "../../../data/finance"

# Write DataFrame to Parquet
# df.write_parquet(f'{output_dir}/historical_stock_quotes_{start_date}_to_{end_date}.parquet')
df.write_parquet(f'{output_dir}/historical_stock_quotes.parquet')

## Step 6: Read Parquet (Validate)

In [6]:
# pl.scan_parquet(f'{output_dir}/historical_stock_quotes_{start_date}_to_{end_date}.parquet').head().collect()
pl.scan_parquet(f'{output_dir}/historical_stock_quotes.parquet').head().collect()

Date,Symbol,Adj Close,Close,High,Low,Open,Volume
datetime[ns],str,f64,f64,f64,f64,f64,f64
2020-01-02 00:00:00,"""AAPL""",,72.796043,72.856636,71.54541,71.799896,135480400.0
2020-01-02 00:00:00,"""ABBV""",,71.589775,71.605761,70.758358,71.214038,5639200.0
2020-01-02 00:00:00,"""ACN""",,195.26358,196.908203,194.018505,195.923292,2431100.0
2020-01-02 00:00:00,"""ADBE""",,334.429993,334.480011,329.170013,330.0,1990100.0
2020-01-02 00:00:00,"""AMT""",,200.433945,202.758447,200.004125,201.469002,1426000.0
