# Rate-Limited FULL MARKET Data Collection (Year by Year)

This notebook collects financial data for **ALL US tickers** (~12,000) with strict rate limiting and saves each year separately.

**Key features:**
1. **Rate limiting**: 750 API calls per minute (4 calls per ticker → 187 tickers/minute max)
2. **Full market coverage**: ~12,000 US tickers (NYSE/NASDAQ)
3. **Year-by-year collection**: Each year saved to separate CSV
4. **Error tracking**: Detailed logs for debugging
5. **Manual control**: Run each year when you want
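6. **Progress saving**: Periodic snapshot CSVs are written during a run, so partial results survive an interruption (there is no automatic resume; the snapshots are removed once a year completes)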

**Target columns:** `quarter`, `ticker`, `industry`, `sector`, `debt_to_assets`, `mkt_cap`, `stock_price`, `mkt_cap_rank`

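**Total time estimate:** at least ~6.4 hours for all 6 years (~1.1 hours per year for ~12,000 tickers at the configured 750 API calls/minute; slow API responses can make a run take longer)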

# ⚠️ FULL MARKET COLLECTION WARNING

**This notebook is configured to collect data for ALL US tickers (~12,000 companies).**

**Time requirements:**
- **Per year:** at least ~1.1 hours (with 750 API calls/minute rate limiting, i.e. ~0.32 seconds per ticker; slow API responses can make it longer)
- **All 6 years (2019-2024):** at least ~6.4 hours total
- **API calls:** ~288,000 total (48,000 per year)

**To start with a smaller test:**
1. Change `MAX_TICKERS = None` to `MAX_TICKERS = 100` in any collection cell
2. Run one year first to verify everything works
3. Then change back to `MAX_TICKERS = None` for full collection

**Pro tip:** Run each year separately and let them complete overnight or during long breaks.

---

In [None]:
import requests
import pandas as pd
import time
from typing import Optional, List, Dict, Any, Tuple
from datetime import datetime, timedelta
import json
import os

# API key: prefer the FMP_API_KEY environment variable so the secret does not
# have to live in the notebook. The literal is kept only as a fallback so that
# existing runs keep working unchanged.
# NOTE(review): a key stored in source should be treated as exposed - rotate it
# and drop the fallback once the environment variable is in place.
API = os.environ.get("FMP_API_KEY") or "7cNMpVzb43GKtm05iRTDWJtyJXSylX8J"

# Rate limiting configuration
API_CALLS_PER_MINUTE = 750
API_CALLS_PER_TICKER = 4  # balance sheet, market cap, price, profile
# Integer division rounds down, which keeps the pace at or under the call budget.
TICKERS_PER_MINUTE = API_CALLS_PER_MINUTE // API_CALLS_PER_TICKER  # 750 // 4 = 187 tickers/minute
SECONDS_PER_TICKER = 60 / TICKERS_PER_MINUTE  # ~0.32 seconds per ticker

print(f"Rate limit configured: {TICKERS_PER_MINUTE} tickers/minute ({SECONDS_PER_TICKER:.2f} seconds/ticker)")

## Helper Functions with Rate Limiting

In [None]:
def get_json(url: str, params: Optional[Dict[str, Any]] = None,
             max_retries: int = 5) -> Optional[List[Dict]]:
    """Fetch JSON from the API and return it as a list of records.

    Args:
        url: Endpoint URL to request.
        params: Extra query parameters. The mapping is copied before the API
            key is added, so the caller's dict is never modified.
        max_retries: How many times to retry after an HTTP 429 response before
            giving up. Every retry is preceded by a 30-second pause.

    Returns:
        The ``"historical"`` list when the payload is a dict containing that
        key, the payload itself when it is already a list, and ``None`` for any
        request error, unexpected payload shape, or exhausted retries.
    """
    try:
        # Copy instead of mutating: a shared default dict or the caller's own
        # dict must not silently acquire the "apikey" entry.
        query = dict(params) if params else {}
        query["apikey"] = API

        # Bounded loop rather than recursion, so a persistent 429 cannot
        # recurse without limit.
        retries_left = max(0, max_retries)
        while True:
            response = requests.get(url, params=query, timeout=10)
            if response.status_code != 429:
                break
            if retries_left == 0:
                print(f"⚠️  Rate limit still in effect after {max(0, max_retries)} retries; giving up.")
                return None
            retries_left -= 1
            print(f"⚠️  Rate limit hit! Waiting 30 seconds...")
            time.sleep(30)

        response.raise_for_status()
        js = response.json()

        # Handle different response formats
        if isinstance(js, dict) and "historical" in js:
            return js["historical"]
        if isinstance(js, list):
            return js

        print(f"Unexpected response format: {type(js)}")
        return None

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error {e.response.status_code}: {e}")
        return None
    except Exception as e:
        # Best-effort fetch: callers treat None as "no data", so timeouts,
        # connection failures and malformed JSON are reported and swallowed.
        print(f"Error fetching data: {e}")
        return None

In [None]:
def _last_value_per_quarter(records: List[Dict], source_col: str, target_col: str) -> pd.DataFrame:
    """Reduce dated records to one row per calendar quarter, keeping the latest date."""
    frame = pd.DataFrame(records)
    frame["date"] = pd.to_datetime(frame["date"])
    frame["quarter"] = frame["date"].dt.to_period("Q")
    latest = frame.sort_values("date").drop_duplicates("quarter", keep="last")
    return latest.rename(columns={source_col: target_col})[["quarter", target_col]]


def process_ticker_year(ticker: str, year: int) -> Tuple[Optional[pd.DataFrame], Dict[str, Any]]:
    """Build the quarterly rows for one ticker in one calendar year.

    Makes four API calls (balance sheet, market cap, price, profile) and joins
    them on calendar quarter.

    Args:
        ticker: Symbol to fetch.
        year: Calendar year to keep.

    Returns:
        A ``(data, error_log)`` tuple. ``data`` has the columns ``quarter``,
        ``ticker``, ``industry``, ``sector``, ``debt_to_assets``, ``mkt_cap``
        and ``stock_price``; it is ``None`` when any source is missing, no
        complete row survives, or an exception occurs. ``error_log`` is a dict
        with ``ticker``, ``year`` and a list of ``errors`` messages.
    """
    error_log = {"ticker": ticker, "year": year, "errors": []}

    try:
        # Date range covering the requested calendar year
        date_range = {
            "from": datetime(year, 1, 1).strftime("%Y-%m-%d"),
            "to": datetime(year, 12, 31).strftime("%Y-%m-%d"),
        }

        # A fixed 20 quarters only spans five years, which is too short for the
        # older years this notebook collects. Size the request from how far
        # back `year` lies, plus one spare year of quarters.
        # NOTE(review): assumes `limit` counts back from the newest filing -
        # confirm against the API documentation.
        quarters_back = (datetime.now().year - year + 1) * 4 + 4

        # Get balance sheet data (API call 1)
        bs = get_json(
            f"https://financialmodelingprep.com/api/v3/balance-sheet-statement/{ticker}",
            {"period": "quarter", "limit": max(20, quarters_back)},
        )

        # Get market cap data (API call 2)
        mc = get_json(
            f"https://financialmodelingprep.com/api/v3/historical-market-capitalization/{ticker}",
            dict(date_range),
        )

        # Get price data (API call 3)
        px = get_json(
            f"https://financialmodelingprep.com/api/v3/historical-price-full/{ticker}",
            dict(date_range),
        )

        # Get company profile (API call 4)
        profile = get_json(f"https://financialmodelingprep.com/api/v3/profile/{ticker}")

        # Record every missing source, not just the first, to aid debugging
        fetched = (
            ("balance sheet", bs),
            ("market cap", mc),
            ("price", px),
            ("profile", profile),
        )
        missing = [f"No {label} data" for label, payload in fetched if not payload]
        if missing:
            error_log["errors"].extend(missing)
            return None, error_log

        # `or` rather than a .get() default: the key can be present with a null
        # value, which would otherwise make the final dropna() discard every
        # row for this ticker.
        industry = profile[0].get("industry") or "Unknown"
        sector = profile[0].get("sector") or "Unknown"

        # Process balance sheet data, keeping only filings dated in `year`
        bs_df = pd.DataFrame(bs)
        bs_df["date"] = pd.to_datetime(bs_df["date"])
        bs_df = bs_df[bs_df["date"].dt.year == year]

        if len(bs_df) == 0:
            error_log["errors"].append(f"No balance sheet data for year {year}")
            return None, error_log

        bs_df = (
            bs_df[["date", "shortTermDebt", "longTermDebt", "totalAssets"]]
            .assign(
                quarter=lambda d: d["date"].dt.to_period("Q"),
                # Zero assets become NaN so the ratio is dropped instead of
                # dividing by zero; .where keeps the column numeric.
                debt_to_assets=lambda d: (
                    (d["shortTermDebt"].fillna(0) + d["longTermDebt"].fillna(0))
                    / d["totalAssets"].where(d["totalAssets"] != 0)
                ),
            )
            .dropna(subset=["debt_to_assets"])
        )

        # Quarter-end market cap and adjusted close (last observation per quarter)
        mc_df = _last_value_per_quarter(mc, "marketCap", "mkt_cap")
        px_df = _last_value_per_quarter(px, "adjClose", "stock_price")

        # Merge all data; rows lacking a market cap or price are discarded
        merged = (
            bs_df.merge(mc_df, on="quarter", how="left")
            .merge(px_df, on="quarter", how="left")
            .assign(ticker=ticker, industry=industry, sector=sector)
            [["quarter", "ticker", "industry", "sector", "debt_to_assets", "mkt_cap", "stock_price"]]
            .dropna()
        )

        if len(merged) == 0:
            error_log["errors"].append("No valid data after merging")
            return None, error_log

        return merged, error_log

    except Exception as e:
        # One bad ticker must not abort a multi-hour run; record and move on.
        error_log["errors"].append(f"Exception: {str(e)}")
        return None, error_log

In [None]:
def _with_mkt_cap_rank(frame: pd.DataFrame) -> pd.DataFrame:
    """Add ``mkt_cap_rank`` in place: dense rank of ``mkt_cap`` per quarter, 1 = largest."""
    frame["mkt_cap_rank"] = (
        frame.groupby("quarter")["mkt_cap"].rank(method="dense", ascending=False).astype(int)
    )
    return frame


def collect_year_data(tickers: List[str], year: int, max_tickers: Optional[int] = None,
                     save_progress: bool = True, progress_interval: int = 100) -> Tuple[pd.DataFrame, List[Dict]]:
    """Collect one year of data for many tickers, pacing requests to the rate limit.

    Args:
        tickers: Symbols to process, in order.
        year: Calendar year to collect.
        max_tickers: Process only the first ``max_tickers`` symbols. ``None``
            means no cap; ``0`` processes nothing.
        save_progress: Write a snapshot CSV every ``progress_interval`` tickers
            and delete this year's snapshots after a successful run.
        progress_interval: Number of tickers between snapshots.

    Returns:
        ``(data, errors)``: the combined frame with a ``mkt_cap_rank`` column
        (empty, with the expected columns, when nothing was collected) and the
        list of per-ticker error logs for failed tickers. The error logs are
        also written to ``errors_{year}.json`` whenever there are any.
    """
    all_data = []
    all_errors = []
    successful_tickers = []
    failed_tickers = []

    # `is not None` so that an explicit 0 means "none", not "all"
    tickers_to_process = tickers[:max_tickers] if max_tickers is not None else tickers
    total_tickers = len(tickers_to_process)

    print(f"\n{'='*70}")
    print(f"  COLLECTING DATA FOR YEAR {year}")
    print(f"{'='*70}")
    print(f"Total tickers: {total_tickers}")
    print(f"Estimated time: {(total_tickers * SECONDS_PER_TICKER / 60):.1f} minutes")
    print(f"API calls: {total_tickers * API_CALLS_PER_TICKER} (at {API_CALLS_PER_MINUTE}/minute)")
    print(f"Progress saves: Every {progress_interval} tickers")
    print(f"{'='*70}\n")

    start_time = time.time()

    for i, ticker in enumerate(tickers_to_process):
        ticker_start = time.time()

        # Progress update
        if i > 0 and i % 20 == 0:
            elapsed = time.time() - start_time
            avg_time = elapsed / i
            remaining = (total_tickers - i) * avg_time
            success_rate = len(successful_tickers) / i * 100

            print(f"\n[Progress: {i}/{total_tickers} ({i/total_tickers*100:.1f}%)]")
            print(f"  Time: {elapsed/60:.1f}min elapsed, ~{remaining/60:.1f}min remaining")
            print(f"  Success rate: {success_rate:.1f}% ({len(successful_tickers)}/{i})")
            print(f"  Current batch: ", end="")

        # Process ticker
        ticker_data, error_log = process_ticker_year(ticker, year)

        if ticker_data is not None and len(ticker_data) > 0:
            all_data.append(ticker_data)
            successful_tickers.append(ticker)
            print(f"✓", end="")
        else:
            failed_tickers.append(ticker)
            all_errors.append(error_log)
            print(f"✗", end="")

        # Save progress periodically
        if save_progress and (i + 1) % progress_interval == 0 and all_data:
            temp_df = _with_mkt_cap_rank(pd.concat(all_data, ignore_index=True))
            progress_filename = f"progress_{year}_tickers_{i+1}.csv"
            temp_df.to_csv(progress_filename, index=False)
            print(f"\n  💾 Progress saved: {progress_filename} ({len(temp_df)} rows)")

        # Strict rate limiting: pad each ticker up to its time budget
        ticker_elapsed = time.time() - ticker_start
        if ticker_elapsed < SECONDS_PER_TICKER:
            time.sleep(SECONDS_PER_TICKER - ticker_elapsed)

    # Final summary (guarded so an empty ticker list cannot divide by zero)
    total_time = time.time() - start_time
    success_pct = len(successful_tickers) / total_tickers * 100 if total_tickers else 0.0
    actual_rate = total_tickers / total_time * 60 if total_time > 0 else 0.0

    print(f"\n\n{'='*70}")
    print(f"  YEAR {year} COLLECTION COMPLETE")
    print(f"{'='*70}")
    print(f"Total time: {total_time/60:.1f} minutes ({total_time/3600:.2f} hours)")
    print(f"Successful: {len(successful_tickers)} tickers ({success_pct:.1f}%)")
    print(f"Failed: {len(failed_tickers)} tickers")
    print(f"Actual rate: {actual_rate:.1f} tickers/minute")

    if all_data:
        # Combine all data and add market cap ranking
        final_df = pd.concat(all_data, ignore_index=True)
        final_df = final_df.sort_values(['ticker', 'quarter']).reset_index(drop=True)
        final_df = _with_mkt_cap_rank(final_df)

        print(f"\n📊 Final dataset: {len(final_df)} rows, {final_df['ticker'].nunique()} tickers")
        print(f"   Quarters: {sorted(final_df['quarter'].unique())}")
    else:
        print("\n⚠️  No data collected!")
        final_df = pd.DataFrame(columns=["quarter", "ticker", "industry", "sector",
                                         "debt_to_assets", "mkt_cap", "stock_price", "mkt_cap_rank"])

    # Save the error log even when nothing was collected: a run where every
    # ticker failed is exactly when the log is needed.
    if all_errors:
        error_filename = f"errors_{year}.json"
        with open(error_filename, 'w') as f:
            json.dump(all_errors, f, indent=2, default=str)
        print(f"\n📝 Error log saved: {error_filename} ({len(all_errors)} errors)")

    # Clean up progress files only after a run that produced data, so snapshots
    # left by an earlier interrupted run are not destroyed by an empty one.
    if save_progress and all_data:
        for progress_file in [f for f in os.listdir('.') if f.startswith(f'progress_{year}_')]:
            os.remove(progress_file)
        print(f"🧹 Cleaned up progress files")

    return final_df, all_errors

## Step 1: Get List of US Tickers

In [None]:
# Build the ticker universe from the API's full stock list
print("Fetching ticker list...")
tickers_data = get_json("https://financialmodelingprep.com/api/v3/stock/list")

if not tickers_data:
    # Nothing usable came back: fall back to a small hand-picked sample
    print("❌ Failed to fetch ticker list. Using sample tickers.")
    us_tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA", "META", "NVDA", "JPM", "JNJ", "V"]
else:
    # Keep NYSE/NASDAQ listings priced above 5 whose symbols are at most five
    # characters long and contain no dot
    us_tickers = [
        d["symbol"]
        for d in tickers_data
        if d["exchangeShortName"] in ("NYSE", "NASDAQ")
        if d.get("price") is not None and d["price"] > 5
        if len(d["symbol"]) <= 5 and "." not in d["symbol"]
    ]

    print(f"✅ Found {len(us_tickers)} US tickers")
    print(f"   Sample: {us_tickers[:10]}")

## Step 2: Test with Single Ticker

In [None]:
# Smoke test: one ticker (AAPL), one year (2023), timed end to end
print("Testing with AAPL for year 2023...")
test_start = time.time()
test_data, test_errors = process_ticker_year("AAPL", 2023)
test_time = time.time() - test_start

print(f"\nTest completed in {test_time:.2f} seconds")

if test_data is None:
    # No frame returned: show the collected error log instead
    print("\n❌ Test failed!")
    print("Errors:", test_errors)
else:
    print("\n✅ Test successful!")
    print(test_data)
    print(f"\nQuarters found: {sorted(test_data['quarter'].unique())}")

## Step 3: Collect Data for 2024

Collect data for ALL US tickers for 2024. This will take at least **~1.1 hours** at the configured rate limit (longer if API responses are slow).

**Note:** Since 2024 is ongoing, you may have partial data (Q1-Q3 or Q1-Q4 depending on current date).

**Time estimate:** ~12,000 tickers × ~0.32 seconds (750 calls/minute ÷ 4 calls per ticker) ≈ 1.1 hours minimum

To test with fewer tickers first, change `MAX_TICKERS = None` to `MAX_TICKERS = 100`

In [None]:
# Year 2024: run the collection, persist it, and preview the largest companies
YEAR = 2024
MAX_TICKERS = None  # no cap: every entry in us_tickers is processed
data_2024, errors_2024 = collect_year_data(us_tickers, year=YEAR, max_tickers=MAX_TICKERS)

if not data_2024.empty:
    filename = f"stock_data_{YEAR}.csv"
    data_2024.to_csv(filename, index=False)
    print(f"\n✅ Data saved to '{filename}'")

    # Preview whichever quarter is the most recent one present in the data
    latest_quarter = data_2024['quarter'].max()
    latest_data = data_2024.loc[data_2024['quarter'] == latest_quarter]
    if not latest_data.empty:
        print(f"\n🏆 Top 10 companies by market cap ({latest_quarter}):")
        top_10 = latest_data.nsmallest(10, 'mkt_cap_rank')[
            ['ticker', 'mkt_cap_rank', 'mkt_cap', 'industry']
        ]
        print(top_10.to_string(index=False))

    # List the quarters that made it into the dataset
    print(f"\n📅 Available quarters for 2024: {sorted(data_2024['quarter'].unique())}")

## Step 4: Collect Data for 2023

In [None]:
# Year 2023: run the collection, persist it, and preview the Q4 leaders
YEAR = 2023
MAX_TICKERS = None  # no cap: every entry in us_tickers is processed

data_2023, errors_2023 = collect_year_data(us_tickers, year=YEAR, max_tickers=MAX_TICKERS)

if not data_2023.empty:
    filename = f"stock_data_{YEAR}.csv"
    data_2023.to_csv(filename, index=False)
    print(f"\n✅ Data saved to '{filename}'")

    # Preview the fourth quarter, when it is present
    q4_data = data_2023.loc[data_2023['quarter'] == f'{YEAR}Q4']
    if not q4_data.empty:
        print(f"\n🏆 Top 10 companies by market cap (Q4 {YEAR}):")
        top_10 = q4_data.nsmallest(10, 'mkt_cap_rank')[
            ['ticker', 'mkt_cap_rank', 'mkt_cap', 'industry']
        ]
        print(top_10.to_string(index=False))

## Step 5: Collect Data for 2022

In [None]:
# Year 2022: run the collection and write the CSV if any rows came back
YEAR = 2022
MAX_TICKERS = None  # no cap: every entry in us_tickers is processed

data_2022, errors_2022 = collect_year_data(us_tickers, year=YEAR, max_tickers=MAX_TICKERS)

if not data_2022.empty:
    filename = f"stock_data_{YEAR}.csv"
    data_2022.to_csv(filename, index=False)
    print(f"\n✅ Data saved to '{filename}'")

## Step 6: Collect Data for 2021

In [None]:
# Year 2021: run the collection and write the CSV if any rows came back
YEAR = 2021
MAX_TICKERS = None  # no cap: every entry in us_tickers is processed

data_2021, errors_2021 = collect_year_data(us_tickers, year=YEAR, max_tickers=MAX_TICKERS)

if not data_2021.empty:
    filename = f"stock_data_{YEAR}.csv"
    data_2021.to_csv(filename, index=False)
    print(f"\n✅ Data saved to '{filename}'")

## Step 7: Collect Data for 2020

In [None]:
# Year 2020: run the collection and write the CSV if any rows came back
YEAR = 2020
MAX_TICKERS = None  # no cap: every entry in us_tickers is processed

data_2020, errors_2020 = collect_year_data(us_tickers, year=YEAR, max_tickers=MAX_TICKERS)

if not data_2020.empty:
    filename = f"stock_data_{YEAR}.csv"
    data_2020.to_csv(filename, index=False)
    print(f"\n✅ Data saved to '{filename}'")

## Step 8: Collect Data for 2019

In [None]:
# Year 2019: run the collection and write the CSV if any rows came back
YEAR = 2019
MAX_TICKERS = None  # no cap: every entry in us_tickers is processed

data_2019, errors_2019 = collect_year_data(us_tickers, year=YEAR, max_tickers=MAX_TICKERS)

if not data_2019.empty:
    filename = f"stock_data_{YEAR}.csv"
    data_2019.to_csv(filename, index=False)
    print(f"\n✅ Data saved to '{filename}'")

## Analysis: Review Collected Data

In [None]:
# Summarise every per-year CSV found in the working directory
import glob

print("📁 Available data files:")
data_files = sorted(glob.glob("stock_data_*.csv"))

# Running row count across all files
total_rows = 0
for file in data_files:
    df = pd.read_csv(file)
    total_rows = total_rows + len(df)
    print(f"  {file}: {len(df):,} rows, {df['ticker'].nunique()} tickers")

print(f"\n📊 Total: {total_rows:,} rows across {len(data_files)} files")

In [None]:
# Summarise the most frequent failure reasons in each saved error log
error_files = sorted(glob.glob("errors_*.json"))

if not error_files:
    print("No error files found.")
else:
    print("📝 Error analysis:")

    for error_file in error_files:
        with open(error_file, 'r') as f:
            errors = json.load(f)

        # Tally messages by their prefix: the text before the first colon,
        # or the whole message when it has no colon
        error_types = {}
        for error in errors:
            for err_msg in error.get('errors', []):
                err_type = err_msg.partition(':')[0]
                error_types[err_type] = error_types.get(err_type, 0) + 1

        print(f"\n{error_file}: {len(errors)} failed tickers")
        # Five most common prefixes, highest count first
        for err_type, count in sorted(error_types.items(), key=lambda x: x[1], reverse=True)[:5]:
            print(f"  - {err_type}: {count}")

## Optional: Combine All Years

In [None]:
# Stitch the per-year CSVs into a single file and re-rank across the full set
years = list(range(2019, 2025))
all_data = []

for year in years:
    filename = f"stock_data_{year}.csv"
    if not os.path.exists(filename):
        print(f"✗ {filename} not found")
        continue

    df = pd.read_csv(filename)
    # Quarters come back from CSV as text; restore them as quarterly periods
    df['quarter'] = pd.PeriodIndex(df['quarter'], freq='Q')
    all_data.append(df)
    print(f"✓ Loaded {year}: {len(df)} rows")

if all_data:
    combined = (
        pd.concat(all_data, ignore_index=True)
        .sort_values(['ticker', 'quarter'])
        .reset_index(drop=True)
    )

    # Recalculate rankings over the combined data
    combined['mkt_cap_rank'] = (
        combined.groupby('quarter')['mkt_cap'].rank(method='dense', ascending=False).astype(int)
    )

    combined.to_csv("stock_data_combined_6years.csv", index=False)

    print(f"\n✅ Combined dataset saved!")
    print(f"   Total: {len(combined):,} rows")
    print(f"   Tickers: {combined['ticker'].nunique()}")
    print(f"   Period: {combined['quarter'].min()} to {combined['quarter'].max()}")