# Separated Data Collection - Stock Prices and Financial Statements

This notebook collects the **SAME DATA** as yearly.ipynb but splits it into two separate tables:

1. **Stock Price Data** (`stock_prices_YYYY.csv`):
   - Aligned to calendar quarter ends (March 31, June 30, Sept 30, Dec 31)
   - Contains: ticker, company_name, quarter_end_date, stock_price, market_cap, mkt_cap_rank, industry, sector, isETF, isFund

2. **Financial Statement Data** (`financial_statements_YYYY.csv`):
   - Aligned to company fiscal quarters with calendar date mapping
   - Contains: ticker, company_name, fiscal_quarter, fiscal_year, calendar_date, debt_to_assets, book_to_market, earnings_yield, industry, sector
   - Now captures all 4 fiscal quarters that overlap with the calendar year

**Key features (SAME as yearly.ipynb):**
- Market cap filter: Only collects data for stocks with market cap > $1B
- Rate limiting: 750 API calls per minute
- Year-by-year collection with historical ticker lists
- Batch processing for efficiency
- Error tracking and progress saves


## Helper Functions

In [1]:
import requests
import pandas as pd
import time
from typing import Optional, List, Dict, Any, Tuple
from datetime import datetime, timedelta
import json
import os
from dotenv import load_dotenv

# Load API key from .env file
load_dotenv(".env")
API = os.getenv("API")  

# Rate limiting configuration (SAME as yearly.ipynb)
API_CALLS_PER_MINUTE = 750
SECONDS_PER_CALL = 60 / API_CALLS_PER_MINUTE  # 0.08 seconds per call

# Session and timer for rate limiting
session = requests.Session()
LAST_API_CALL = 0.0

# Market cap threshold (1 billion) - SAME as yearly.ipynb
MARKET_CAP_THRESHOLD = 1e9

print(f"Rate limit configured: {API_CALLS_PER_MINUTE} calls/minute ({SECONDS_PER_CALL:.2f} seconds/call)")
print(f"Market cap filter: > ${MARKET_CAP_THRESHOLD/1e9:.0f}B")


Rate limit configured: 750 calls/minute (0.08 seconds/call)
Market cap filter: > $1B


In [2]:
# Core helper functions (EXACT SAME as yearly.ipynb)
def get_json(url: str, params: Dict[str, Any] = {}) -> Optional[Any]:
    """Safely get JSON data from API with error handling and rate limit retry"""
    global LAST_API_CALL, session
    try:
        params['apikey'] = API
        elapsed = time.time() - LAST_API_CALL
        if elapsed < SECONDS_PER_CALL:
            time.sleep(SECONDS_PER_CALL - elapsed)
        response = session.get(url, params=params, timeout=10)
        LAST_API_CALL = time.time()
        if response.status_code == 429:
            print('⚠️  Rate limit hit! Waiting 30 seconds...')
            time.sleep(30)
            return get_json(url, params)
        response.raise_for_status()
        js = response.json()
        if isinstance(js, dict) and 'historical' in js:
            return js['historical']
        elif isinstance(js, list):
            return js
        else:
            return js
    except requests.exceptions.HTTPError as e:
        print(f'HTTP Error {e.response.status_code}: {e}')
        return None
    except Exception as e:
        print(f'Error fetching data: {e}')
        return None

def check_market_cap(ticker: str, year: int, precomputed: Optional[float] = None) -> Tuple[bool, Optional[float]]:
    """Check if ticker had market cap above threshold in given year"""
    if precomputed is not None:
        return precomputed > MARKET_CAP_THRESHOLD, precomputed
    try:
        start_date = f'{year}-01-01'
        end_date = f'{year}-12-31'
        mc_data = get_json(
            f'https://financialmodelingprep.com/api/v3/historical-market-capitalization/{ticker}',
            {'from': start_date, 'to': end_date}
        )
        if not mc_data:
            return False, None
        mc_df = pd.DataFrame(mc_data)
        avg_market_cap = mc_df['marketCap'].mean()
        return avg_market_cap > MARKET_CAP_THRESHOLD, avg_market_cap
    except Exception as e:
        print(f'Error checking market cap for {ticker}: {e}')
        return False, None

def get_bulk_profiles(tickers: List[str]) -> Dict[str, Any]:
    """Fetch company profiles in bulk."""
    data = get_json(f'https://financialmodelingprep.com/api/v3/profile/{','.join(tickers)}')
    profiles = {}
    if isinstance(data, list):
        for item in data:
            symbol = item.get('symbol')
            profiles[symbol] = item
    return profiles


In [3]:
# CRITICAL FUNCTION - EXACT SAME LOGIC AS yearly.ipynb
def get_historical_tickers(year: int) -> List[str]:
    """Get list of US tickers that existed in a specific year"""
    print(f"Fetching ticker list for year {year}...")
    
    # Try to get historical ticker list from end of previous year
    date = f"{year-1}-12-31"
    
    # First try to get available stocks for that date
    available_stocks = get_json(
        f"https://financialmodelingprep.com/api/v3/available-traded/list",
        {"date": date}
    )
    
    if available_stocks:
        # Filter for US exchanges
        us_tickers = [
            stock["symbol"] for stock in available_stocks 
            if stock.get("exchangeShortName") in ["NYSE", "NASDAQ", "AMEX"]
            and len(stock["symbol"]) <= 5
            and "." not in stock["symbol"]
        ]
        print(f"✅ Found {len(us_tickers)} US tickers for {year}")
        return us_tickers
    
    # Fallback: use current ticker list with a warning
    print(f"⚠️  Could not get historical ticker list for {year}, using current list")
    tickers_data = get_json("https://financialmodelingprep.com/api/v3/stock/list")
    
    if tickers_data:
        # Filter for US exchanges and remove penny stocks
        us_tickers = [
            d["symbol"] for d in tickers_data 
            if d["exchangeShortName"] in ["NYSE", "NASDAQ"] 
            and (d.get("price") is not None and d.get("price", 0) > 5)
            and len(d["symbol"]) <= 5
            and "." not in d["symbol"]
        ]
        
        print(f"✅ Found {len(us_tickers)} current US tickers")
        return us_tickers
    else:
        print("❌ Failed to fetch ticker list. Using sample tickers.")
        return ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA", "META", "NVDA", "JPM", "JNJ", "V"]


In [10]:
# FIXED process_ticker_year to return separated data with proper fiscal quarter handling
def process_ticker_year_separated(ticker: str, year: int, profile_data: Optional[Dict[str, Any]] = None, 
                                 avg_market_cap: Optional[float] = None) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame], Dict[str, Any], int]:
    """Process data for a single ticker for a specific year - returns separated price and statement data"""
    error_log = {'ticker': ticker, 'year': year, 'errors': []}
    api_calls = 0
    
    try:
        # Check market cap (SAME as yearly.ipynb)
        is_large_cap, avg_market_cap = check_market_cap(ticker, year, precomputed=avg_market_cap)
        if avg_market_cap is None:
            api_calls += 1
        
        if not is_large_cap:
            error_log['errors'].append(f'Market cap below threshold (avg: ${avg_market_cap:,.0f})')
            return None, None, error_log, api_calls
        
        start_date = datetime(year, 1, 1)
        end_date = datetime(year, 12, 31)
        
        # Get all the data - use broader date range for financial statements
        # Extend date range for financial statements to capture fiscal years
        fs_start_date = datetime(year - 1, 1, 1)
        fs_end_date = datetime(year + 1, 12, 31)
        
        bs = get_json(f'https://financialmodelingprep.com/api/v3/balance-sheet-statement/{ticker}', 
                     {'period': 'quarter', 'limit': 100})
        api_calls += 1
        
        inc = get_json(f'https://financialmodelingprep.com/api/v3/income-statement/{ticker}', 
                      {'period': 'quarter', 'limit': 100})
        api_calls += 1
        
        # Get market cap data for broader range to cover all fiscal quarters
        mc = get_json(f'https://financialmodelingprep.com/api/v3/historical-market-capitalization/{ticker}', 
                     {'from': fs_start_date.strftime('%Y-%m-%d'), 'to': fs_end_date.strftime('%Y-%m-%d')})
        api_calls += 1
        
        # Get price data for broader range to cover all fiscal quarters
        px = get_json(f'https://financialmodelingprep.com/api/v3/historical-price-full/{ticker}', 
                     {'from': fs_start_date.strftime('%Y-%m-%d'), 'to': fs_end_date.strftime('%Y-%m-%d')})
        api_calls += 1
        
        if profile_data is None:
            profile = get_json(f'https://financialmodelingprep.com/api/v3/profile/{ticker}')
            api_calls += 1
        else:
            profile = [profile_data] if isinstance(profile_data, dict) else profile_data
        
        if not all([bs, inc, mc, px, profile]):
            if not bs: error_log['errors'].append('No balance sheet data')
            if not inc: error_log['errors'].append('No income statement data')
            if not mc: error_log['errors'].append('No market cap data')
            if not px: error_log['errors'].append('No price data')
            if not profile: error_log['errors'].append('No profile data')
            return None, None, error_log, api_calls
        
        # Extract profile info
        profile_info = profile[0] if profile and len(profile) > 0 else {}
        company_name = profile_info.get('companyName', '')
        industry = profile_info.get('industry', 'Unknown')
        sector = profile_info.get('sector', 'Unknown')
        is_etf = profile_info.get('isEtf', False)
        is_fund = profile_info.get('isFund', False)
        
        # Process all data
        bs_df = pd.DataFrame(bs)
        bs_df['date'] = pd.to_datetime(bs_df['date'])
        
        inc_df = pd.DataFrame(inc)
        inc_df['date'] = pd.to_datetime(inc_df['date'])
        
        mc_df = pd.DataFrame(mc)
        mc_df['date'] = pd.to_datetime(mc_df['date'])
        
        px_df = pd.DataFrame(px)
        px_df['date'] = pd.to_datetime(px_df['date'])
        
        # Create calendar quarter end dates
        quarter_dates = [
            f"{year}-03-31",
            f"{year}-06-30",
            f"{year}-09-30",
            f"{year}-12-31"
        ]
        
        # 1. Create Stock Price Data (aligned to calendar quarters)
        price_data_list = []
        for quarter_date in quarter_dates:
            # Find closest price to quarter end
            quarter_dt = pd.to_datetime(quarter_date)
            px_quarter = px_df[abs(px_df['date'] - quarter_dt) <= pd.Timedelta(days=7)]
            
            if len(px_quarter) > 0:
                # Get closest date
                closest_idx = abs(px_quarter['date'] - quarter_dt).idxmin()
                price_row = px_quarter.loc[closest_idx]
                
                # Get market cap for this date
                mc_quarter = mc_df[abs(mc_df['date'] - quarter_dt) <= pd.Timedelta(days=7)]
                if len(mc_quarter) > 0:
                    closest_mc_idx = abs(mc_quarter['date'] - quarter_dt).idxmin()
                    market_cap = mc_quarter.loc[closest_mc_idx, 'marketCap']
                else:
                    market_cap = None
                
                if market_cap and market_cap >= MARKET_CAP_THRESHOLD:
                    price_data_list.append({
                        'ticker': ticker,
                        'company_name': company_name,
                        'quarter_end_date': quarter_date,
                        'stock_price': price_row['adjClose'],
                        'market_cap': market_cap,
                        'industry': industry,
                        'sector': sector,
                        'isETF': is_etf,
                        'isFund': is_fund
                    })
        
        # 2. Create Financial Statement Data (find best 4 quarters for the calendar year)
        statement_data_list = []
        
        # Merge balance sheet and income statement by date
        bs_quarters = bs_df[['date', 'shortTermDebt', 'longTermDebt', 'totalAssets', 
                             'totalStockholdersEquity', 'commonStock']].copy()
        inc_quarters = inc_df[['date', 'eps', 'weightedAverageShsOut', 'period', 
                              'calendarYear', 'netIncome']].copy()
        
        # Join on date
        merged_statements = pd.merge(bs_quarters, inc_quarters, on='date', how='inner')
        
        if len(merged_statements) == 0:
            # If no merged statements, just return price data
            price_df = pd.DataFrame(price_data_list) if price_data_list else None
            return price_df, None, error_log, api_calls
        
        # Sort by date
        merged_statements = merged_statements.sort_values('date')
        
        # Find the 4 quarters that best overlap with the calendar year
        # Score each quarter based on how well it represents the calendar year
        scored_quarters = []
        for _, row in merged_statements.iterrows():
            quarter_date = row['date']
            
            # Calculate relevance score for this quarter to the calendar year
            # Quarters ending in the calendar year get highest score
            if quarter_date.year == year:
                if quarter_date.month in [3, 6, 9, 12]:  # Standard quarters
                    score = 10 + (12 - abs(quarter_date.month - 6))  # Prefer middle of year
                else:
                    score = 8
            # Quarters ending in Q1 of following year (for companies with Dec fiscal year)
            elif quarter_date.year == year + 1 and quarter_date.month <= 3:
                score = 7
            # Quarters ending in Q4 of previous year (for companies with early fiscal year)
            elif quarter_date.year == year - 1 and quarter_date.month >= 10:
                score = 6
            else:
                score = 0
            
            if score > 0:
                scored_quarters.append((score, row))
        
        # Sort by score (descending) and take top 4
        scored_quarters.sort(key=lambda x: x[0], reverse=True)
        top_quarters = [quarter for score, quarter in scored_quarters[:4]]
        
        for row in top_quarters:
            fiscal_date = row['date'].strftime('%Y-%m-%d')
            
            # Get market cap for this fiscal date using the broader dataset we already have
            mc_fiscal = mc_df[abs(mc_df['date'] - row['date']) <= pd.Timedelta(days=10)]
            if len(mc_fiscal) > 0:
                closest_mc_idx = abs(mc_fiscal['date'] - row['date']).idxmin()
                market_cap = mc_fiscal.loc[closest_mc_idx, 'marketCap']
            else:
                # Skip this quarter if no market cap data
                continue
            
            if market_cap < MARKET_CAP_THRESHOLD:
                continue
            
            # Get stock price for this date using the broader dataset we already have  
            px_fiscal = px_df[abs(px_df['date'] - row['date']) <= pd.Timedelta(days=10)]
            if len(px_fiscal) > 0:
                closest_px_idx = abs(px_fiscal['date'] - row['date']).idxmin()
                stock_price = px_fiscal.loc[closest_px_idx, 'adjClose']
            else:
                # Skip this quarter if no price data
                continue
            
            # Calculate ratios
            total_debt = (row['shortTermDebt'] or 0) + (row['longTermDebt'] or 0)
            debt_to_assets = total_debt / row['totalAssets'] if row['totalAssets'] > 0 else None
            
            if stock_price and row['weightedAverageShsOut'] > 0:
                book_to_market = (row['totalStockholdersEquity'] / row['weightedAverageShsOut']) / stock_price
                earnings_yield = row['eps'] / stock_price if row['eps'] is not None else None
            else:
                book_to_market = None
                earnings_yield = None
            
            statement_data_list.append({
                'ticker': ticker,
                'company_name': company_name,
                'fiscal_quarter': row['period'],
                'fiscal_year': row['calendarYear'],
                'calendar_date': fiscal_date,
                'debt_to_assets': debt_to_assets,
                'book_to_market': book_to_market,
                'earnings_yield': earnings_yield,
                'industry': industry,
                'sector': sector
            })
        
        # Convert to DataFrames
        price_df = pd.DataFrame(price_data_list) if price_data_list else None
        statement_df = pd.DataFrame(statement_data_list) if statement_data_list else None
        
        if price_df is None and statement_df is None:
            error_log['errors'].append('No valid data after processing')
            return None, None, error_log, api_calls
        
        return price_df, statement_df, error_log, api_calls
        
    except Exception as e:
        error_log['errors'].append(f'Exception: {str(e)}')
        return None, None, error_log, api_calls


In [11]:
# Main collection function - modified from yearly.ipynb to handle separated data
def collect_year_data_separated(tickers: List[str], year: int, max_tickers: Optional[int] = None, 
                               save_progress: bool = True, progress_interval: int = 100, 
                               batch_size: int = 50) -> Tuple[pd.DataFrame, pd.DataFrame, List[Dict]]:
    """Collect separated price and statement data for multiple tickers for a specific year"""
    all_price_data = []
    all_statement_data = []
    all_errors = []
    successful_tickers = []
    failed_tickers = []
    skipped_tickers = []
    total_api_calls = 0
    
    tickers_to_process = tickers[:max_tickers] if max_tickers else tickers
    total_tickers = len(tickers_to_process)
    
    print(f"\n{'='*70}")
    print(f"  COLLECTING SEPARATED DATA FOR YEAR {year}")
    print(f"{'='*70}")
    print(f"Total tickers to check: {total_tickers}")
    print(f"Market cap filter: >${MARKET_CAP_THRESHOLD/1e9:.0f}B")
    print(f"API rate limit: {API_CALLS_PER_MINUTE} calls/minute")
    print(f"Batch size: {batch_size} tickers")
    print(f"Progress saves: Every {progress_interval} tickers")
    print(f"{'='*70}\n")
    
    start_time = time.time()
    
    # Process tickers in batches (SAME as yearly.ipynb)
    for batch_start in range(0, total_tickers, batch_size):
        batch_end = min(batch_start + batch_size, total_tickers)
        batch_tickers = tickers_to_process[batch_start:batch_end]
        
        # Progress update
        if batch_start > 0:
            elapsed = time.time() - start_time
            avg_time = elapsed / batch_start
            remaining = (total_tickers - batch_start) * avg_time
            
            print(f"\n[Progress: {batch_start}/{total_tickers} ({batch_start/total_tickers*100:.1f}%)]")
            print(f"  Time: {elapsed/60:.1f}min elapsed, ~{remaining/60:.1f}min remaining")
            print(f"  Success: {len(successful_tickers)}, Failed: {len(failed_tickers)}, Skipped (small cap): {len(skipped_tickers)}")
            print(f"  API calls: {total_api_calls} ({total_api_calls/elapsed*60:.0f}/minute avg)")
        
        print(f"\n  Processing batch {batch_start//batch_size + 1}: tickers {batch_start+1}-{batch_end}")
        
        # Get bulk profiles for the batch (1 API call for up to 50 tickers)
        profiles = get_bulk_profiles(batch_tickers)
        total_api_calls += 1
        
        # Process each ticker in the batch
        for i, ticker in enumerate(batch_tickers):
            profile_data = profiles.get(ticker)
            
            # Process ticker with pre-fetched profile
            price_data, statement_data, error_log, api_calls = process_ticker_year_separated(
                ticker, year, profile_data=profile_data
            )
            total_api_calls += api_calls
            
            if (price_data is not None and len(price_data) > 0) or (statement_data is not None and len(statement_data) > 0):
                if price_data is not None:
                    all_price_data.append(price_data)
                if statement_data is not None:
                    all_statement_data.append(statement_data)
                successful_tickers.append(ticker)
                print("✓", end="", flush=True)
            elif any("Market cap below threshold" in err for err in error_log.get("errors", [])):
                skipped_tickers.append(ticker)
                print("○", end="", flush=True)
            else:
                failed_tickers.append(ticker)
                all_errors.append(error_log)
                print("✗", end="", flush=True)
        
        # Save progress periodically
        if save_progress and (batch_end % progress_interval == 0 or batch_end == total_tickers):
            if all_price_data:
                temp_price_df = pd.concat(all_price_data, ignore_index=True)
                temp_price_df['mkt_cap_rank'] = temp_price_df.groupby('quarter_end_date')['market_cap'].rank(
                    method='dense', ascending=False).astype(int)
                progress_price_filename = f"progress_prices_{year}_tickers_{batch_end}.csv"
                temp_price_df.to_csv(progress_price_filename, index=False)
                print(f"\n  💾 Price progress saved: {progress_price_filename} ({len(temp_price_df)} rows)")
            
            if all_statement_data:
                temp_statement_df = pd.concat(all_statement_data, ignore_index=True)
                progress_statement_filename = f"progress_statements_{year}_tickers_{batch_end}.csv"
                temp_statement_df.to_csv(progress_statement_filename, index=False)
                print(f"  💾 Statement progress saved: {progress_statement_filename} ({len(temp_statement_df)} rows)")
    
    # Final summary
    total_time = time.time() - start_time
    
    print(f"\n\n{'='*70}")
    print(f"  YEAR {year} COLLECTION COMPLETE")
    print(f"{'='*70}")
    print(f"Total time: {total_time/60:.1f} minutes ({total_time/3600:.2f} hours)")
    print(f"Successful: {len(successful_tickers)} tickers")
    print(f"Failed: {len(failed_tickers)} tickers")
    print(f"Skipped (small cap): {len(skipped_tickers)} tickers")
    print(f"Total API calls: {total_api_calls:,} ({total_api_calls/total_time*60:.0f}/minute avg)")
    
    # Combine all data
    if all_price_data:
        final_price_df = pd.concat(all_price_data, ignore_index=True)
        # Add market cap ranking
        final_price_df['mkt_cap_rank'] = final_price_df.groupby('quarter_end_date')['market_cap'].rank(
            method='dense', ascending=False).astype(int)
        # Sort by ticker and quarter
        final_price_df = final_price_df.sort_values(['ticker', 'quarter_end_date']).reset_index(drop=True)
    else:
        final_price_df = pd.DataFrame()
    
    if all_statement_data:
        final_statement_df = pd.concat(all_statement_data, ignore_index=True)
        # Sort by ticker and date
        final_statement_df = final_statement_df.sort_values(['ticker', 'calendar_date']).reset_index(drop=True)
    else:
        final_statement_df = pd.DataFrame()
    
    print(f"\n📊 Final datasets:")
    print(f"   Price data: {len(final_price_df)} rows, {final_price_df['ticker'].nunique() if len(final_price_df) > 0 else 0} tickers")
    print(f"   Statement data: {len(final_statement_df)} rows, {final_statement_df['ticker'].nunique() if len(final_statement_df) > 0 else 0} tickers")
    
    # Save error log
    if all_errors:
        error_filename = f"errors_{year}.json"
        with open(error_filename, 'w') as f:
            json.dump(all_errors, f, indent=2, default=str)
        print(f"\n📝 Error log saved: {error_filename} ({len(all_errors)} errors)")
    
    # Clean up progress files
    if save_progress:
        for progress_file in [f for f in os.listdir('.') if f.startswith(f'progress_prices_{year}_') or f.startswith(f'progress_statements_{year}_')]:
            os.remove(progress_file)
        print(f"🧹 Cleaned up progress files")
    
    return final_price_df, final_statement_df, all_errors


In [12]:
# Git commands to sync and push data
import subprocess
import os

def git_push_year_data(year: int):
    """Push year data to GitHub with proper branch detection"""
    try:
        print("🔄 Syncing with remote repository...")
        
        # Get current branch name (run from parent directory)
        result = subprocess.run(["git", "branch", "--show-current"], 
                              capture_output=True, text=True, check=True, cwd="..")
        current_branch = result.stdout.strip()
        
        # Add files with proper paths (files are in yearly runs directory)
        files_to_add = [f"yearly runs/stock_prices_{year}.csv", 
                       f"yearly runs/financial_statements_{year}.csv", 
                       f"yearly runs/errors_{year}.json"]
        
        # Check which files exist
        existing_files = []
        for file_path in files_to_add:
            if os.path.exists(f"../{file_path}"):  # Check from notebook's perspective
                existing_files.append(file_path)
        
        if existing_files:
            print(f"📁 Adding files: {existing_files}")
            subprocess.run(["git", "add"] + existing_files, check=True, cwd="..")
            
            print(f"💾 Committing: Add separated data for {year}")
            subprocess.run(["git", "commit", "-m", f"Add separated data for {year}"], check=True, cwd="..")
            
            print("🚀 Pushing to GitHub...")
            subprocess.run(["git", "push", "origin", current_branch], check=True, cwd="..")
            
            print(f"✅ Successfully pushed data for {year} to {current_branch} branch!")
        else:
            print(f"❌ No files found to push for {year}")
            print(f"   Looking for: {files_to_add}")
        
    except subprocess.CalledProcessError as e:
        print(f"❌ Git operation failed: {e}")
        print("💡 You may need to resolve conflicts manually")
    except Exception as e:
        print(f"❌ Error: {e}")

# Call this after collecting data for a year
# git_push_year_data(2020)

## Test with Single Ticker


In [13]:
# Test with a single ticker (SAME approach as yearly.ipynb)
def test_single_ticker(ticker: str, year: int):
    """Test data collection for a single ticker"""
    print(f"Testing with {ticker} for year {year}...")
    test_start = time.time()
    
    # Get profile first
    profile_data = get_bulk_profiles([ticker]).get(ticker)
    
    # Process ticker
    price_data, statement_data, error_log, api_calls = process_ticker_year_separated(ticker, year, profile_data)
    
    test_time = time.time() - test_start
    print(f"\nTest completed in {test_time:.2f} seconds with {api_calls} API calls")
    
    if price_data is not None:
        print(f"\n✅ Price data collected: {len(price_data)} records")
        print(price_data)
    else:
        print("\n❌ No price data collected")
    
    if statement_data is not None:
        print(f"\n✅ Statement data collected: {len(statement_data)} records")
        print(statement_data)
    else:
        print("\n❌ No statement data collected")
    
    if error_log['errors']:
        print(f"\nErrors: {error_log}")

# Test with AAPL for 2018 to see if we get all 4 quarters
test_single_ticker("AAPL", 2005)

print("\n" + "="*80)
print("Now testing AAPL for 2012 (earliest available year) to verify older year functionality...")
print("="*80)

# Test with AAPL for 2012 to see if we get all 4 quarters (2005 data doesn't exist, API only goes back to 2012)
#test_single_ticker("AAPL", 2012)


Testing with AAPL for year 2005...

Test completed in 1.27 seconds with 4 API calls

✅ Price data collected: 4 records
  ticker company_name quarter_end_date  stock_price   market_cap  \
0   AAPL   Apple Inc.       2005-03-31         1.25  34005638240   
1   AAPL   Apple Inc.       2005-06-30         1.11  30131226160   
2   AAPL   Apple Inc.       2005-09-30         1.61  44430167880   
3   AAPL   Apple Inc.       2005-12-31         2.16  59783000760   

               industry      sector  isETF  isFund  
0  Consumer Electronics  Technology  False   False  
1  Consumer Electronics  Technology  False   False  
2  Consumer Electronics  Technology  False   False  
3  Consumer Electronics  Technology  False   False  

✅ Statement data collected: 4 records
  ticker company_name fiscal_quarter fiscal_year calendar_date  \
0   AAPL   Apple Inc.             Q3        2005    2005-06-25   
1   AAPL   Apple Inc.             Q2        2005    2005-03-26   
2   AAPL   Apple Inc.             Q4  

## Collect Data for Years


In [37]:
# Collect data for a specific year (SAME approach as yearly.ipynb)
def collect_and_save_year(year: int, max_tickers: Optional[int] = None):
    """Collect and save separated data for a specific year"""
    
    # Get historical ticker list for the year (EXACT SAME as yearly.ipynb)
    us_tickers = get_historical_tickers(year)
    
    # Collect data with optimized batch processing
    price_df, statement_df, errors = collect_year_data_separated(
        us_tickers, year=year, max_tickers=max_tickers
    )
    
    # Save the data
    if len(price_df) > 0:
        price_filename = f"stock_prices_{year}.csv"
        price_df.to_csv(price_filename, index=False)
        print(f"\n✅ Price data saved to '{price_filename}'")
        
        # Show summary statistics
        print(f"\n📈 Price Data Summary:")
        print(f"   Records: {len(price_df)}")
        print(f"   Unique tickers: {price_df['ticker'].nunique()}")
        print(f"   Date range: {price_df['quarter_end_date'].min()} to {price_df['quarter_end_date'].max()}")
        
        # Show top companies by market cap
        latest_quarter = price_df['quarter_end_date'].max()
        latest_data = price_df[price_df['quarter_end_date'] == latest_quarter]
        if len(latest_data) > 0:
            print(f"\n🏆 Top 10 companies by market cap ({latest_quarter}):")
            top_10 = latest_data.nsmallest(10, 'mkt_cap_rank')[['ticker', 'company_name', 'mkt_cap_rank', 'market_cap', 'isETF', 'isFund']]
            top_10['market_cap'] = top_10['market_cap'].apply(lambda x: f"${x/1e9:.1f}B")
            print(top_10.to_string(index=False))
    
    if len(statement_df) > 0:
        statement_filename = f"financial_statements_{year}.csv"
        statement_df.to_csv(statement_filename, index=False)
        print(f"\n✅ Statement data saved to '{statement_filename}'")
        
        # Show summary statistics
        print(f"\n📊 Statement Data Summary:")
        print(f"   Records: {len(statement_df)}")
        print(f"   Unique tickers: {statement_df['ticker'].nunique()}")
        print(f"   Date range: {statement_df['calendar_date'].min()} to {statement_df['calendar_date'].max()}")
        print(f"   Fiscal years included: {sorted(statement_df['fiscal_year'].unique())}")
        print(f"   Debt/Assets - Mean: {statement_df['debt_to_assets'].mean():.3f}, Median: {statement_df['debt_to_assets'].median():.3f}")
        print(f"   Book/Market - Mean: {statement_df['book_to_market'].mean():.3f}, Median: {statement_df['book_to_market'].median():.3f}")
        print(f"   Earnings Yield - Mean: {statement_df['earnings_yield'].mean():.3f}, Median: {statement_df['earnings_yield'].median():.3f}")

# Example: Collect 2024 data
# To test with fewer tickers first, use max_tickers parameter
# collect_and_save_year(2024, max_tickers=100)  # Test with 100 tickers
# collect_and_save_year(2024)  # Full collection


In [38]:
# Collect data for multiple years
def collect_multiple_years(start_year: int, end_year: int, max_tickers: Optional[int] = None):
    """Collect data for a range of years"""
    for year in range(start_year, end_year + 1):
        print(f"\n{'='*80}")
        print(f"{'='*80}")
        print(f"  STARTING COLLECTION FOR YEAR {year}")
        print(f"{'='*80}")
        print(f"{'='*80}")
        
        try:
            collect_and_save_year(year, max_tickers=max_tickers)
        except Exception as e:
            print(f"❌ Failed to collect data for {year}: {e}")
            continue

# Example: Collect data for years 2020-2024
# collect_multiple_years(2020, 2024, max_tickers=100)  # Test with 100 tickers per year
# collect_multiple_years(2020, 2024)  # Full collection


## Collect Several Years at Once

In [None]:
collect_multiple_years(2015, 2019, max_tickers=100)

## Individual Year Collection Cells

Run these cells one by one to collect data for each year. Each cell is independent.


In [None]:
# Collect 2024 data
YEAR = 2024
MAX_TICKERS = None  # Set to a number like 100 to test with fewer tickers

collect_and_save_year(YEAR, max_tickers=MAX_TICKERS)


In [None]:
# Collect 2023 data
YEAR = 2023
MAX_TICKERS = None

collect_and_save_year(YEAR, max_tickers=MAX_TICKERS)


In [None]:
# Collect 2022 data
YEAR = 2022
MAX_TICKERS = None

collect_and_save_year(YEAR, max_tickers=MAX_TICKERS)


In [None]:
# Collect 2021 data
YEAR = 2021
MAX_TICKERS = None

collect_and_save_year(YEAR, max_tickers=MAX_TICKERS)


In [None]:
# Collect 2020 data
YEAR = 2020
MAX_TICKERS = None

collect_and_save_year(YEAR, max_tickers=MAX_TICKERS)


In [None]:
# Collect 2019 data
YEAR = 2019
MAX_TICKERS = None

collect_and_save_year(YEAR, max_tickers=MAX_TICKERS)

In [None]:
# Collect 2018 data
YEAR = 2018
MAX_TICKERS = None

collect_and_save_year(YEAR, max_tickers=MAX_TICKERS)

git_push_year_data(YEAR)

In [None]:
# Collect 2017 data
YEAR = 2017
MAX_TICKERS = None

collect_and_save_year(YEAR, max_tickers=MAX_TICKERS)

In [None]:
# Collect 2016 data
YEAR = 2016
MAX_TICKERS = None

collect_and_save_year(YEAR, max_tickers=MAX_TICKERS)

In [None]:
# Collect 2015 data
YEAR = 2015
MAX_TICKERS = None

collect_and_save_year(YEAR, max_tickers=MAX_TICKERS)