## Step 1: Import Libraries

In [1]:
%run ../make_clean_names.py

In [2]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import logging

import polars as pl
from typing import List, Dict, Any
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv (presumably from a local .env
# file — TODO confirm) so the FMP_API_KEY lookup further down can read them.
load_dotenv()

# Show log records at INFO level and above, and create the module-level logger
# that the fetch helper writes its error messages to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def create_session():
    """Build a ``requests.Session`` that retries failed HTTPS requests.

    The retry policy is configured with ``total=3``, ``backoff_factor=1`` and
    a status force-list of 429, 500, 502, 503 and 504. It is attached through
    an ``HTTPAdapter`` (``pool_maxsize=10``) mounted on the ``https://`` prefix
    only.

    Returns:
        The configured session.
    """
    retry_policy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    https_adapter = HTTPAdapter(max_retries=retry_policy, pool_maxsize=10)

    http_session = requests.Session()
    http_session.mount('https://', https_adapter)
    return http_session

# The FMP API key is read from the environment; stop right away when it is
# missing or empty instead of failing later inside a request.
FMP_API_KEY = os.environ.get('FMP_API_KEY')
if FMP_API_KEY is None or FMP_API_KEY == "":
    raise ValueError("FMP_API_KEY not found in environment variables")

## Step 2: Extract Data from FMP into Polars

In [3]:
# Fetch Historical S&P 500 Data

def fetch_historical_sp500(api_key: str, session: requests.Session) -> List[Dict]:
    """Fetch historical S&P 500 constituent data from the FMP API.

    Args:
        api_key: FMP API key, sent as the ``apikey`` query parameter.
        session: Session used to issue the GET request (10-second timeout).

    Returns:
        The decoded JSON payload when it is a list. An empty list is returned,
        and the problem is logged, when the request fails, the body cannot be
        decoded as JSON, or the decoded payload is not a list.
    """
    url = "https://financialmodelingprep.com/api/v3/historical/sp500_constituent"
    params = {"apikey": api_key}

    try:
        response = session.get(url, params=params, timeout=10)
        response.raise_for_status()
        payload = response.json()
    except Exception as e:
        # The key travels in the query string and HTTP error messages from
        # requests include the request URL, so mask the key before the message
        # reaches the log (and therefore the saved notebook output).
        message = str(e)
        if api_key:
            message = message.replace(api_key, "***")
        logger.error(f"Error fetching historical S&P 500 data: {message}")
        return []

    # Callers are promised a list of records; a non-list payload (for example
    # a JSON error object) must not be passed on as if it were data.
    if not isinstance(payload, list):
        logger.error(
            "Error fetching historical S&P 500 data: expected a JSON list, got %s",
            type(payload).__name__,
        )
        return []

    return payload
    
session = create_session()

# Fetch the data
historical_sp500_data = fetch_historical_sp500(FMP_API_KEY, session)

# fetch_historical_sp500 returns an empty list when the request fails. Stop
# here with a clear error instead of leaving `df` undefined (or stale from an
# earlier run) for the cells below.
if not historical_sp500_data:
    raise RuntimeError(
        "No historical S&P 500 constituent data was returned; check the log output above"
    )

# Convert the list of records to a Polars DataFrame
df = pl.DataFrame(historical_sp500_data)

## Step 3: Clean Column Names

In [4]:
# make_clean_names is not defined in any cell of this notebook — presumably it
# comes from the `%run ../make_clean_names.py` line in Step 1 (TODO confirm).
# Note: this rebinds `df`, so the frame with the original column names is no
# longer reachable under this name after the cell runs.
df = make_clean_names(df)

## Step 4: Write Polars to Parquet

In [5]:
# Define the output directory (a relative path, so it is resolved against the
# notebook's working directory)
output_dir = "../../../data/finance"

# Create the directory if it is missing so the write below does not fail on a
# fresh checkout; exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

# Write the processed DataFrame to a Parquet file
df.write_parquet(f'{output_dir}/historical_sp500_constituents.parquet')

## Step 5: Read Parquet (Validate)

In [6]:
# Validate the output: lazily scan the Parquet file written in the previous
# step and materialize only the first rows (head() with no argument) so the
# cell displays a small preview rather than the whole table.
pl.scan_parquet(f'{output_dir}/historical_sp500_constituents.parquet').head().collect()

date_added,added_security,removed_ticker,removed_security,date,symbol,reason
str,str,str,str,str,str,str
"""December 23, 2024""","""Lennox International""","""CTLT""","""Catalent""","""2024-12-22""","""LII""","""Acquired by Novo Holdings A/S …"
"""November 26, 2024""","""Texas Pacific Land Corporation""","""MRO""","""Marathon Oil""","""2024-11-25""","""TPL""","""ConocoPhillips acquired Marath…"
"""September 30, 2024""","""Amentum""","""BBWI""","""Bath & Body Works, Inc.""","""2024-10-30""","""AMTM""","""Market capitalization change."""
"""September 23, 2024""","""Palantir Technologies""","""AAL""","""American Airlines Group""","""2024-09-23""","""PLTR""","""Market capitalization change."""
"""September 23, 2024""","""Erie Indemnity""","""BIO""","""Bio-Rad Laboratories""","""2024-09-23""","""ERIE""","""Market capitalization change."""
