## 1. Consolidate Symbols from All Price Data

Before fetching market cap data, we need a definitive list of symbols for which we have any price data. This script will scan both the `partials_price` (last 10 years) and `partials_price_30yr` (10-30 years ago) directories, extract all unique symbols, and save them into a combined list. This ensures we query the market cap API for every symbol relevant to our dataset.


In [None]:
import pandas as pd
import os

def create_symbol_list_from_price_partials():
    """
    Build the list of symbols that market cap data should be fetched for.

    Scans the 10-year and 30-year price partials directories, treats every
    ``<symbol>.csv`` filename as one symbol, and writes the de-duplicated,
    alphabetically sorted symbols to 'symbols_for_market_cap.csv' (a single
    'symbol' column) in the current working directory.

    A missing directory is reported and skipped. If no CSV files are found in
    any directory, a message is printed and no output file is written.

    Returns:
        None. The result is the CSV file written to disk.
    """
    partials_dirs = [
        '../Prices/Price Data/partials_price',
        '../Prices/Price Data/partials_price_30yr'
    ]
    output_path = 'symbols_for_market_cap.csv'
    csv_suffix = '.csv'

    all_symbols = set()

    for partials_dir in partials_dirs:
        if not os.path.isdir(partials_dir):
            print(f"Warning: Price partials directory not found at '{partials_dir}'. Skipping.")
            continue

        # Strip only the trailing '.csv'. Splitting on the first '.' would
        # truncate symbols that themselves contain a dot (e.g. 'BRK.B').
        all_symbols.update(
            filename[:-len(csv_suffix)]
            for filename in os.listdir(partials_dir)
            if filename.endswith(csv_suffix)
        )

    # A file named exactly '.csv' carries no symbol.
    all_symbols.discard('')

    if not all_symbols:
        print("No price data files found in any partials directory.")
        return

    # Sort so the output file is identical from run to run (set order is not).
    symbols_df = pd.DataFrame(sorted(all_symbols), columns=['symbol'])
    symbols_df.to_csv(output_path, index=False)

    print(f"Found {len(symbols_df)} unique symbols with price data across all periods.")
    print(f"Symbol list for market cap fetching saved to '{output_path}'.")

# Run the consolidation when this cell executes. On success it writes
# 'symbols_for_market_cap.csv' relative to the current working directory.
create_symbol_list_from_price_partials()



Found 10798 unique symbols with price data.
Symbol list for market cap fetching saved to 'symbols_for_market_cap.csv'.


## 2. Fetch Historical Market Cap Data

This script fetches the historical daily market cap for every symbol in our generated list. It's built to be robust and efficient, using the same asynchronous, resumable approach as the price and financial statement fetchers.

- **Concurrent:** It uses `aiohttp` to make up to 15 API requests at a time.
- **Resumable:** It saves data for each symbol to a file in the `partials` directory. If the script is stopped, it will automatically skip any symbols it has already completed.
- **Error Handling:** It will retry failed requests and, if a symbol consistently fails, it will be logged for review.


In [None]:
import pandas as pd
import os
from dotenv import load_dotenv
import asyncio
import aiohttp
from tqdm.notebook import tqdm
import nest_asyncio
from datetime import datetime, timedelta

# Apply nest_asyncio to allow running asyncio event loop in Jupyter.
# The notebook kernel already has a loop running, so asyncio is patched here
# before any coroutine in this cell is awaited.
nest_asyncio.apply()

async def fetch_market_cap_and_save(session, symbol, pbar, partials_dir, api_key):
    """
    Fetch the last ten years of daily market cap for one symbol and save it.

    The response JSON is loaded into a DataFrame and written to
    ``<partials_dir>/<symbol>.csv``. Nothing is written when the API returns
    an empty payload, so such symbols have no partial file afterwards.

    Args:
        session: Open aiohttp client session used for the GET request.
        symbol: Ticker symbol to request.
        pbar: Progress bar whose ``write`` method is used for status messages.
        partials_dir: Directory that receives the per-symbol CSV.
        api_key: API key sent as the ``apikey`` query parameter.

    Returns:
        None in every case. Failures are reported through ``pbar.write``
        rather than raised.
    """
    ten_years_ago = (datetime.now() - timedelta(days=365*10)).strftime('%Y-%m-%d')
    today = datetime.now().strftime('%Y-%m-%d')

    url = "https://financialmodelingprep.com/stable/historical-market-capitalization"
    params = {"symbol": symbol, "from": ten_years_ago, "to": today, "apikey": api_key}

    max_retries = 3
    # Rate-limit waits are budgeted separately from error retries so a
    # persistently throttled symbol cannot loop forever while holding a
    # concurrency slot, yet is not abandoned after a few brief throttles.
    max_rate_limit_waits = 10
    default_retry_after = 15

    attempts = 0
    rate_limit_waits = 0
    while attempts < max_retries:
        try:
            async with session.get(url, params=params, timeout=60) as response:
                if response.status == 429:
                    rate_limit_waits += 1
                    if rate_limit_waits > max_rate_limit_waits:
                        pbar.write(f"Rate limit for {symbol} did not clear after {max_rate_limit_waits} waits. Giving up.")
                        return

                    # Retry-After may be an HTTP-date instead of a number of
                    # seconds; fall back to the default wait in that case.
                    try:
                        retry_after = int(response.headers.get("Retry-After", default_retry_after))
                    except (TypeError, ValueError):
                        retry_after = default_retry_after

                    pbar.write(f"Rate limit hit for {symbol}. Waiting {retry_after}s...")
                    await asyncio.sleep(retry_after)
                    continue

                response.raise_for_status()
                data = await response.json()

                if data:
                    df = pd.DataFrame(data)
                    if not df.empty:
                        output_path = os.path.join(partials_dir, f"{symbol}.csv")
                        df.to_csv(output_path, index=False)

                # The request succeeded, with or without data.
                return

        except Exception as e:
            attempts += 1
            if attempts < max_retries:
                # Linear backoff: 2s, then 4s.
                wait_time = 2 * attempts
                pbar.write(f"Request for {symbol} failed with {e}. Retrying in {wait_time}s... (Attempt {attempts}/{max_retries})")
                await asyncio.sleep(wait_time)
            else:
                pbar.write(f"API fetch failed for {symbol} after {max_retries} retries. Final error: {e}")
    return

async def fetch_market_cap_main():
    """
    Fetch market cap data for every listed symbol that has no partial file yet.

    Reads symbols from 'symbols_for_market_cap.csv', skips those that already
    have ``<symbol>.csv`` in 'Market Cap Data/partials', and fetches the rest
    concurrently. Task start-up is paced by a fixed interval and the number
    of in-flight requests is capped by a semaphore.

    Afterwards, every listed symbol that still has no partial file is written
    to 'symbols_with_no_market_cap.csv'. This includes symbols for which the
    API returned no data, because no file is written for them.

    Returns:
        None. Returns early (after printing a message) when the API key or
        symbol list is missing, or when nothing is left to fetch.
    """
    # --- Configuration ---
    SYMBOL_LIST_PATH = 'symbols_for_market_cap.csv'
    PARTIALS_DIR = 'Market Cap Data/partials'
    FAILED_SYMBOLS_PATH = 'symbols_with_no_market_cap.csv'
    CSV_SUFFIX = '.csv'

    def completed_symbols_on_disk():
        # Strip only the trailing '.csv' so dotted tickers (e.g. 'BRK.B')
        # map back to the exact symbol they were saved under.
        return {
            name[:-len(CSV_SUFFIX)]
            for name in os.listdir(PARTIALS_DIR)
            if name.endswith(CSV_SUFFIX)
        }

    os.makedirs(PARTIALS_DIR, exist_ok=True)

    load_dotenv()
    API_KEY = os.getenv('API_KEY')
    if not API_KEY:
        print("API_KEY not found in .env file.")
        return

    try:
        # Read everything as plain text: with pandas' default NA handling a
        # ticker such as 'NA' would be parsed as a missing value.
        symbols_to_fetch_df = pd.read_csv(SYMBOL_LIST_PATH, dtype=str, keep_default_na=False)
    except FileNotFoundError:
        print(f"'{SYMBOL_LIST_PATH}' not found. Please run the symbol consolidation cell first.")
        return

    # Blank rows carry no symbol to query.
    all_symbols = [s for s in symbols_to_fetch_df['symbol'].unique().tolist() if s]
    completed_symbols = completed_symbols_on_disk()
    symbols_to_process = [s for s in all_symbols if s not in completed_symbols]

    if not symbols_to_process:
        print("All required symbols have already been processed. No API calls needed.")
        return

    print(f"Starting API fetch for {len(symbols_to_process)} symbols.")

    # --- Rate Limiting and Concurrency Control ---
    # To stay under the 2500 requests/minute limit, we'll target 40 requests/second.
    # This means we should start a new request no more frequently than every 1/40 = 0.025 seconds.
    API_CALL_INTERVAL = 1.0 / 40.0

    # The semaphore controls how many requests can be active at once.
    # This prevents overwhelming the server with connections and manages local resource usage.
    CONCURRENCY_LIMIT = 15
    sem = asyncio.Semaphore(CONCURRENCY_LIMIT)

    async with aiohttp.ClientSession() as session:
        with tqdm(total=len(symbols_to_process), desc="Fetching Market Caps") as pbar:

            async def fetch_with_sem(symbol):
                # Acquire a concurrency slot before making the API call.
                async with sem:
                    await fetch_market_cap_and_save(session, symbol, pbar, PARTIALS_DIR, API_KEY)
                pbar.update(1)

            tasks = []
            for symbol in symbols_to_process:
                # Create a task for each symbol...
                tasks.append(asyncio.create_task(fetch_with_sem(symbol)))
                # ...and then pause briefly before creating the next one.
                await asyncio.sleep(API_CALL_INTERVAL)

            # Wait for all the created tasks to complete.
            await asyncio.gather(*tasks)

    # A real newline: the escaped "\\n" printed a literal backslash-n.
    print("\n--- API data fetching is complete. ---")

    # Any listed symbol still without a partial file is reported.
    final_completed_symbols = completed_symbols_on_disk()
    failed_symbols = [s for s in all_symbols if s not in final_completed_symbols]

    if failed_symbols:
        failed_df = pd.DataFrame(failed_symbols, columns=['symbol'])
        failed_df.to_csv(FAILED_SYMBOLS_PATH, index=False)
        print(f"Identified {len(failed_symbols)} symbols that could not be fetched. Their list has been saved to '{FAILED_SYMBOLS_PATH}'.")
    else:
        print("All symbols were fetched successfully.")

# Run the API fetching process.
# NOTE: top-level `await` relies on the notebook's already-running event loop;
# in a plain script this would need asyncio.run(fetch_market_cap_main()).
await fetch_market_cap_main()



Starting API fetch for 10798 symbols.


Fetching Market Caps:   0%|          | 0/10798 [00:00<?, ?it/s]

Rate limit hit for ENB. Waiting 15s...
Rate limit hit for TELO. Waiting 15s...
Rate limit hit for NODK. Waiting 15s...
Rate limit hit for WFBI. Waiting 15s...
Rate limit hit for CONN. Waiting 15s...
Rate limit hit for CVE. Waiting 15s...
Rate limit hit for GCBC. Waiting 15s...
Rate limit hit for LLY. Waiting 15s...
Rate limit hit for ERII. Waiting 15s...
Rate limit hit for NYMX. Waiting 15s...
Rate limit hit for MGEN. Waiting 15s...
Rate limit hit for HEP. Waiting 15s...
Rate limit hit for BANC-PE. Waiting 15s...
Rate limit hit for AIRG. Waiting 15s...
Rate limit hit for APXTU. Waiting 15s...
Rate limit hit for ENB. Waiting 15s...
Rate limit hit for TELO. Waiting 15s...
Rate limit hit for WFBI. Waiting 15s...
Rate limit hit for NODK. Waiting 15s...
Rate limit hit for GCBC. Waiting 15s...
Rate limit hit for CONN. Waiting 15s...
Rate limit hit for CVE. Waiting 15s...
Rate limit hit for LLY. Waiting 15s...
Rate limit hit for ERII. Waiting 15s...
Rate limit hit for HEP. Waiting 15s...
Rate

## 3. Assemble Final Market Cap Reports

Once the fetching script is complete, run this cell. It gathers all the individual symbol files from the `partials` directory and assembles them into four CSV files, split alphabetically by the first letter of the symbol (A-F, G-L, M-R, S-Z), inside the `Market Cap Data` directory.


In [3]:
import pandas as pd
import os
import glob

def assemble_market_cap_reports():
    """
    Assemble the per-symbol market cap CSVs into four alphabetical batch files.

    Reads every ``*.csv`` in 'Market Cap Data/partials', concatenates them, and
    splits the rows by the upper-cased first character of the 'symbol' column
    into A-F, G-L, M-R and S-Z. Each non-empty chunk is written to
    'Market Cap Data/market_cap_data_<chunk>.csv'.

    Characters that sort before 'A' (such as digits) fall into A-F, and
    characters that sort after 'Z' fall into S-Z, because the outer chunks
    are open-ended comparisons.

    Returns:
        None. Prints a message and returns early when the partials directory
        is missing, contains no CSVs, or the data has no 'symbol' column.
    """
    partials_dir = 'Market Cap Data/partials'
    output_dir = 'Market Cap Data'
    os.makedirs(output_dir, exist_ok=True)

    if not os.path.exists(partials_dir):
        print(f"Directory with partial files not found: {partials_dir}")
        return

    # glob returns files in arbitrary order; sort so the row order of the
    # output is the same on every run.
    all_files = sorted(glob.glob(os.path.join(partials_dir, '*.csv')))
    if not all_files:
        print("No partial symbol files were found to assemble.")
        return

    print(f"Assembling data from {len(all_files)} individual symbol files...")

    # Only the empty string is treated as missing. With pandas' default NA
    # list a ticker such as 'NA' would be read as NaN and end up as 'nan'.
    # The generator avoids holding a separate list of frames, but concat still
    # materialises the full combined frame in memory.
    df_generator = (
        pd.read_csv(f, dtype={'symbol': str}, keep_default_na=False, na_values=[''])
        for f in all_files
    )
    full_df = pd.concat(df_generator, ignore_index=True)

    if 'symbol' not in full_df.columns:
        print("Error: 'symbol' column not found in the assembled data. Cannot create batches.")
        return

    # Ensure symbol column is string for sorting
    full_df['symbol'] = full_df['symbol'].astype(str)

    # Compute the bucketing key once instead of once per comparison.
    first_letter = full_df['symbol'].str[0].str.upper()

    # Split into alphabetical chunks
    symbol_chunks = {
        "A-F": full_df[first_letter <= 'F'],
        "G-L": full_df[(first_letter >= 'G') & (first_letter <= 'L')],
        "M-R": full_df[(first_letter >= 'M') & (first_letter <= 'R')],
        "S-Z": full_df[first_letter >= 'S']
    }

    for chunk_name, chunk_df in symbol_chunks.items():
        if not chunk_df.empty:
            output_path = os.path.join(output_dir, f"market_cap_data_{chunk_name}.csv")
            chunk_df.to_csv(output_path, index=False)
            print(f"Saved chunk '{chunk_name}' with {len(chunk_df)} records to {output_path}")
        else:
            print(f"No data for chunk '{chunk_name}'.")

    print("--- Assembly of final market cap reports is complete. ---")

# Run the assembly when this cell executes; output files are written under
# 'Market Cap Data' relative to the current working directory.
assemble_market_cap_reports()



Assembling data from 10532 individual symbol files...
Saved chunk 'A-F' with 5206722 records to Market Cap Data\market_cap_data_A-F.csv
Saved chunk 'G-L' with 2494634 records to Market Cap Data\market_cap_data_G-L.csv
Saved chunk 'M-R' with 3223399 records to Market Cap Data\market_cap_data_M-R.csv
Saved chunk 'S-Z' with 3036302 records to Market Cap Data\market_cap_data_S-Z.csv
--- Assembly of final market cap reports is complete. ---


## 4. Fetch Historical Market Cap Data (10-40 Years Ago)

This script fetches historical daily market cap data for the period from January 1, 1985, to 10 years before the present date. It uses the same robust, resumable, and concurrent process as the other fetching scripts.


In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
import asyncio
import aiohttp
from tqdm.notebook import tqdm
import nest_asyncio
from datetime import datetime, timedelta

# Apply nest_asyncio to allow running asyncio event loop in Jupyter.
# The notebook kernel already has a loop running, so asyncio is patched here
# before any coroutine in this cell is awaited.
nest_asyncio.apply()

async def fetch_market_cap_and_save_30yr(session, symbol, pbar, partials_dir, api_key, from_date, to_date):
    """
    Fetch daily market cap for one symbol over a caller-supplied date range.

    The response JSON is loaded into a DataFrame and written to
    ``<partials_dir>/<symbol>.csv``. Nothing is written when the API returns
    an empty payload, so such symbols have no partial file afterwards.

    Args:
        session: Open aiohttp client session used for the GET request.
        symbol: Ticker symbol to request.
        pbar: Progress bar whose ``write`` method is used for status messages.
        partials_dir: Directory that receives the per-symbol CSV.
        api_key: API key sent as the ``apikey`` query parameter.
        from_date: Start of the range, sent as the ``from`` query parameter.
        to_date: End of the range, sent as the ``to`` query parameter.

    Returns:
        None in every case. Failures are reported through ``pbar.write``
        rather than raised.
    """
    url = "https://financialmodelingprep.com/stable/historical-market-capitalization"
    params = {"symbol": symbol, "from": from_date, "to": to_date, "apikey": api_key}

    max_retries = 3
    # Rate-limit waits are budgeted separately from error retries so a
    # persistently throttled symbol cannot loop forever while holding a
    # concurrency slot, yet is not abandoned after a few brief throttles.
    max_rate_limit_waits = 10
    default_retry_after = 15

    attempts = 0
    rate_limit_waits = 0
    while attempts < max_retries:
        try:
            async with session.get(url, params=params, timeout=60) as response:
                if response.status == 429:
                    rate_limit_waits += 1
                    if rate_limit_waits > max_rate_limit_waits:
                        pbar.write(f"Rate limit for {symbol} did not clear after {max_rate_limit_waits} waits. Giving up.")
                        return

                    # Retry-After may be an HTTP-date instead of a number of
                    # seconds; fall back to the default wait in that case.
                    try:
                        retry_after = int(response.headers.get("Retry-After", default_retry_after))
                    except (TypeError, ValueError):
                        retry_after = default_retry_after

                    pbar.write(f"Rate limit hit for {symbol}. Waiting {retry_after}s...")
                    await asyncio.sleep(retry_after)
                    continue

                response.raise_for_status()
                data = await response.json()

                if data:
                    df = pd.DataFrame(data)
                    if not df.empty:
                        output_path = os.path.join(partials_dir, f"{symbol}.csv")
                        df.to_csv(output_path, index=False)

                # The request succeeded, with or without data.
                return

        except Exception as e:
            attempts += 1
            if attempts < max_retries:
                # Linear backoff: 2s, then 4s.
                wait_time = 2 * attempts
                pbar.write(f"Request for {symbol} failed with {e}. Retrying in {wait_time}s... (Attempt {attempts}/{max_retries})")
                await asyncio.sleep(wait_time)
            else:
                pbar.write(f"API fetch failed for {symbol} after {max_retries} retries. Final error: {e}")
    return

async def fetch_market_cap_main_30yr():
    """
    Fetch older market cap data (1985-01-01 up to ten years ago) per symbol.

    Reads symbols from 'symbols_for_market_cap.csv', skips those that already
    have ``<symbol>.csv`` in 'Market Cap Data/partials_30yr', and fetches the
    rest concurrently. Task start-up is paced by a fixed interval and the
    number of in-flight requests is capped by a semaphore.

    Afterwards, every listed symbol that still has no partial file is written
    to 'symbols_with_no_market_cap_30yr.csv'. This includes symbols for which
    the API returned no data, because no file is written for them.

    Returns:
        None. Returns early (after printing a message) when the API key or
        symbol list is missing, or when nothing is left to fetch.
    """
    # --- Configuration ---
    SYMBOL_LIST_PATH = 'symbols_for_market_cap.csv'
    PARTIALS_DIR = 'Market Cap Data/partials_30yr'
    FAILED_SYMBOLS_PATH = 'symbols_with_no_market_cap_30yr.csv'
    CSV_SUFFIX = '.csv'

    def completed_symbols_on_disk():
        # Strip only the trailing '.csv' so dotted tickers (e.g. 'BRK.B')
        # map back to the exact symbol they were saved under.
        return {
            name[:-len(CSV_SUFFIX)]
            for name in os.listdir(PARTIALS_DIR)
            if name.endswith(CSV_SUFFIX)
        }

    os.makedirs(PARTIALS_DIR, exist_ok=True)

    load_dotenv()
    API_KEY = os.getenv('API_KEY')
    if not API_KEY:
        print("API_KEY not found in .env file.")
        return

    # 1. Define date range: a fixed start date up to ten years before today.
    from_date = '1985-01-01'
    to_date = (datetime.now() - timedelta(days=365*10)).strftime('%Y-%m-%d')

    # 2. Load symbols and check against already completed ones
    try:
        # Read everything as plain text: with pandas' default NA handling a
        # ticker such as 'NA' would be parsed as a missing value.
        symbols_to_fetch_df = pd.read_csv(SYMBOL_LIST_PATH, dtype=str, keep_default_na=False)
    except FileNotFoundError:
        print(f"'{SYMBOL_LIST_PATH}' not found. Please run the symbol consolidation cell first.")
        return

    # Blank rows carry no symbol to query.
    all_symbols = [s for s in symbols_to_fetch_df['symbol'].unique().tolist() if s]
    completed_symbols = completed_symbols_on_disk()
    symbols_to_process = [s for s in all_symbols if s not in completed_symbols]

    if not symbols_to_process:
        print("All required symbols for the 10-30 year range have already been processed.")
        return

    print(f"Starting 10-30 year API fetch for {len(symbols_to_process)} symbols (from {from_date} to {to_date}).")

    # 3. Asynchronous fetching with rate limiting
    CONCURRENCY_LIMIT = 20
    # One new task every 0.077s is roughly 13 starts/second (~780/minute),
    # which stays well under 3000 calls/min.
    API_CALL_INTERVAL = 0.077
    sem = asyncio.Semaphore(CONCURRENCY_LIMIT)

    async with aiohttp.ClientSession() as session:
        with tqdm(total=len(symbols_to_process), desc="Fetching 10-30yr Market Caps") as pbar:
            tasks = []

            async def fetch_with_sem(symbol):
                # Acquire a concurrency slot before making the API call.
                async with sem:
                    await fetch_market_cap_and_save_30yr(session, symbol, pbar, PARTIALS_DIR, API_KEY, from_date, to_date)
                pbar.update(1)

            for symbol in symbols_to_process:
                tasks.append(asyncio.create_task(fetch_with_sem(symbol)))
                # Pace task creation so requests are not all started at once.
                await asyncio.sleep(API_CALL_INTERVAL)

            await asyncio.gather(*tasks)

    print("\n--- API data fetching for 10-30 year range is complete. ---")

    # 4. Any listed symbol still without a partial file is reported.
    final_completed_symbols = completed_symbols_on_disk()
    failed_symbols = [s for s in all_symbols if s not in final_completed_symbols]

    if failed_symbols:
        failed_df = pd.DataFrame(failed_symbols, columns=['symbol'])
        failed_df.to_csv(FAILED_SYMBOLS_PATH, index=False)
        print(f"Identified {len(failed_symbols)} symbols that could not be fetched. List saved to '{FAILED_SYMBOLS_PATH}'.")
    else:
        print("All symbols were fetched successfully.")

# Run the API fetching process.
# NOTE: top-level `await` relies on the notebook's already-running event loop;
# in a plain script this would need asyncio.run(fetch_market_cap_main_30yr()).
await fetch_market_cap_main_30yr()



Starting 10-30 year API fetch for 10798 symbols (from 1985-01-01 to 2015-08-28).


Fetching 10-30yr Market Caps:   0%|          | 0/10798 [00:00<?, ?it/s]


--- API data fetching for 10-30 year range is complete. ---
Identified 6732 symbols that could not be fetched. List saved to 'symbols_with_no_market_cap_30yr.csv'.


## 5. Assemble 30-Year Market Cap Reports

This cell assembles the individual symbol files from the `partials_30yr` directory into four CSV files, split alphabetically by the first letter of the symbol. The resulting files are stored in the `Market Cap Data` directory with a `30yr_` prefix.

In [2]:
import pandas as pd
import os
import glob

def assemble_market_cap_reports_30yr():
    """
    Assemble the older per-symbol market cap CSVs into four batch files.

    Reads every ``*.csv`` in 'Market Cap Data/partials_30yr', concatenates
    them, and splits the rows by the upper-cased first character of the
    'symbol' column into A-F, G-L, M-R and S-Z. Each non-empty chunk is
    written to 'Market Cap Data/30yr_market_cap_data_<chunk>.csv'.

    Characters that sort before 'A' (such as digits) fall into A-F, and
    characters that sort after 'Z' fall into S-Z, because the outer chunks
    are open-ended comparisons.

    Returns:
        None. Prints a message and returns early when the partials directory
        is missing, contains no CSVs, or the data has no 'symbol' column.
    """
    partials_dir = 'Market Cap Data/partials_30yr'
    output_dir = 'Market Cap Data'
    os.makedirs(output_dir, exist_ok=True)

    if not os.path.exists(partials_dir):
        print(f"Directory with partial files not found: {partials_dir}")
        return

    # glob returns files in arbitrary order; sort so the row order of the
    # output is the same on every run.
    all_files = sorted(glob.glob(os.path.join(partials_dir, '*.csv')))
    if not all_files:
        print("No partial 30-year symbol files were found to assemble.")
        return

    print(f"Assembling 30-year data from {len(all_files)} individual symbol files...")

    # Only the empty string is treated as missing. With pandas' default NA
    # list a ticker such as 'NA' would be read as NaN and end up as 'nan'.
    # The generator avoids holding a separate list of frames, but concat still
    # materialises the full combined frame in memory.
    df_generator = (
        pd.read_csv(f, dtype={'symbol': str}, keep_default_na=False, na_values=[''])
        for f in all_files
    )
    full_df = pd.concat(df_generator, ignore_index=True)

    if 'symbol' not in full_df.columns:
        print("Error: 'symbol' column not found in the assembled data. Cannot create batches.")
        return

    # Ensure symbol column is string for sorting
    full_df['symbol'] = full_df['symbol'].astype(str)

    # Compute the bucketing key once instead of once per comparison.
    first_letter = full_df['symbol'].str[0].str.upper()

    # Split into alphabetical chunks
    symbol_chunks = {
        "A-F": full_df[first_letter <= 'F'],
        "G-L": full_df[(first_letter >= 'G') & (first_letter <= 'L')],
        "M-R": full_df[(first_letter >= 'M') & (first_letter <= 'R')],
        "S-Z": full_df[first_letter >= 'S']
    }

    for chunk_name, chunk_df in symbol_chunks.items():
        if not chunk_df.empty:
            output_path = os.path.join(output_dir, f"30yr_market_cap_data_{chunk_name}.csv")
            chunk_df.to_csv(output_path, index=False)
            print(f"Saved 30-year chunk '{chunk_name}' with {len(chunk_df)} records to {output_path}")
        else:
            print(f"No 30-year data for chunk '{chunk_name}'.")

    print("--- Assembly of final 30-year market cap reports is complete. ---")

# Run the assembly when this cell executes; output files are written under
# 'Market Cap Data' relative to the current working directory.
assemble_market_cap_reports_30yr()



Assembling 30-year data from 4066 individual symbol files...
Saved 30-year chunk 'A-F' with 4641300 records to Market Cap Data\30yr_market_cap_data_A-F.csv
Saved 30-year chunk 'G-L' with 2167286 records to Market Cap Data\30yr_market_cap_data_G-L.csv
Saved 30-year chunk 'M-R' with 2848765 records to Market Cap Data\30yr_market_cap_data_M-R.csv
Saved 30-year chunk 'S-Z' with 2669492 records to Market Cap Data\30yr_market_cap_data_S-Z.csv
--- Assembly of final 30-year market cap reports is complete. ---


## 6. Consolidate All Market Cap Data

This final step merges the 10-year and 30-year market cap data files. It reads each corresponding pair of chunked files (e.g., `market_cap_data_A-F.csv` and `30yr_market_cap_data_A-F.csv`), combines them, removes any duplicate rows based on the symbol and date, and overwrites the original 10-year file with the consolidated data. This leaves you with a single, complete set of market cap data spanning the last 30+ years.


In [3]:
import pandas as pd
import os

def consolidate_all_market_cap_data():
    """
    Merge each 10-year chunk file with its 30-year counterpart in place.

    For every chunk (A-F, G-L, M-R, S-Z) the 10-year file and the '30yr_'
    file in 'Market Cap Data' are concatenated with the 10-year rows first.
    Rows sharing the same ('symbol', 'date') are de-duplicated keeping the
    first occurrence, and the result replaces the 10-year file. A chunk is
    skipped with a warning if either of its two files is missing.

    Returns:
        None. Progress is reported with print statements.
    """
    data_dir = 'Market Cap Data'
    chunks = ["A-F", "G-L", "M-R", "S-Z"]

    # Only the empty string is treated as missing. With pandas' default NA
    # list a ticker such as 'NA' would be read as NaN and then written back
    # as an empty field, losing the symbol.
    read_options = {'dtype': {'symbol': str}, 'keep_default_na': False, 'na_values': ['']}

    print("--- Starting consolidation of all market cap data. ---")

    for chunk in chunks:
        ten_yr_file = os.path.join(data_dir, f"market_cap_data_{chunk}.csv")
        thirty_yr_file = os.path.join(data_dir, f"30yr_market_cap_data_{chunk}.csv")

        # Check if both files exist before trying to merge
        if not os.path.exists(ten_yr_file):
            print(f"Warning: Base file not found for chunk '{chunk}'. Skipping.")
            continue
        if not os.path.exists(thirty_yr_file):
            print(f"Warning: 30-year file not found for chunk '{chunk}'. Skipping.")
            continue

        print(f"Processing chunk '{chunk}'...")

        # Read both files
        ten_yr_df = pd.read_csv(ten_yr_file, **read_options)
        thirty_yr_df = pd.read_csv(thirty_yr_file, **read_options)

        # 10-year rows come first, so keep='first' prefers them on overlap.
        combined_df = pd.concat([ten_yr_df, thirty_yr_df], ignore_index=True)

        initial_rows = len(combined_df)
        combined_df = combined_df.drop_duplicates(subset=['symbol', 'date'], keep='first')
        final_rows = len(combined_df)

        # Write to a temporary file and swap it in, so an interrupted write
        # cannot leave the original 10-year file truncated.
        temp_file = ten_yr_file + '.tmp'
        combined_df.to_csv(temp_file, index=False)
        os.replace(temp_file, ten_yr_file)

        print(f"  - Combined '{chunk}': {initial_rows} rows -> {final_rows} rows after deduplication.")
        print(f"  - Saved consolidated data to '{ten_yr_file}'.")

    print("--- Consolidation of all market cap reports is complete. ---")

# Run the consolidation when this cell executes. This overwrites the
# 'market_cap_data_<chunk>.csv' files in 'Market Cap Data' in place.
consolidate_all_market_cap_data()



--- Starting consolidation of all market cap data. ---
Processing chunk 'A-F'...
  - Combined 'A-F': 9848022 rows -> 9843561 rows after deduplication.
  - Saved consolidated data to 'Market Cap Data\market_cap_data_A-F.csv'.
Processing chunk 'G-L'...
  - Combined 'G-L': 4661920 rows -> 4659782 rows after deduplication.
  - Saved consolidated data to 'Market Cap Data\market_cap_data_G-L.csv'.
Processing chunk 'M-R'...
  - Combined 'M-R': 6072164 rows -> 6069387 rows after deduplication.
  - Saved consolidated data to 'Market Cap Data\market_cap_data_M-R.csv'.
Processing chunk 'S-Z'...
  - Combined 'S-Z': 5705794 rows -> 5703161 rows after deduplication.
  - Saved consolidated data to 'Market Cap Data\market_cap_data_S-Z.csv'.
--- Consolidation of all market cap reports is complete. ---
