# ETF Universe Collection & Update

**Purpose:** Collect a comprehensive universe of 2,000+ ETFs from multiple sources.

**Run Frequency:** 
- Monthly for new ETF additions
- Weekly for price data updates

**Sources:**
1. ETF Database (etfdb.com) - comprehensive ETF list
2. Nasdaq listings - official exchange data
3. Comprehensive seed list - curated by category

**Process:**
1. Scrape ETF tickers from multiple sources
2. Merge and deduplicate
3. Filter (remove leveraged, low AUM, etc.)
4. Download price data in parallel (20 threads)
5. Validate data quality
6. Generate statistics and reports

In [None]:
# Setup and imports
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import warnings

# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Dynamic project root detection: starting from the working directory, pick
# the closest ancestor (or the directory itself) holding 'requirements.txt'.
_start_dir = Path().resolve()
PROJECT_ROOT = next(
    (
        _candidate
        for _candidate in (_start_dir, *_start_dir.parents)
        if (_candidate / 'requirements.txt').exists()
    ),
    None,
)
if PROJECT_ROOT is None:
    # Without the marker file the upward walk would end at the filesystem
    # root, so every data path derived below would hang off '/'. Fall back to
    # the starting directory and say so instead.
    PROJECT_ROOT = _start_dir
    print(f"WARNING: 'requirements.txt' not found in {_start_dir} or its parents; "
          f"using the working directory as project root")

# Put the project root first on sys.path so 'src.*' imports resolve.
# Drop a previous entry first: re-running this cell must not stack duplicates.
_root_str = str(PROJECT_ROOT)
if _root_str in sys.path:
    sys.path.remove(_root_str)
sys.path.insert(0, _root_str)

print(f"Project Root: {PROJECT_ROOT}")
print(f"Python: {sys.version}")
print(f"Notebook executed: {datetime.now()}")

In [None]:
# Import custom modules
from src.data_collection.etf_universe_builder import ComprehensiveETFScraper, ParallelETFDownloader

# Define paths (all derived from the detected project root)
DATA_DIR = PROJECT_ROOT / 'data' / 'raw'
PRICES_DIR = DATA_DIR / 'prices'
UNIVERSE_FILE = DATA_DIR / 'etf_universe.csv'
RESULTS_DIR = PROJECT_ROOT / 'results'

# Create the output directories; creating PRICES_DIR with parents=True also
# creates DATA_DIR, and exist_ok=True keeps re-runs harmless.
for _directory in (PRICES_DIR, RESULTS_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

# Status lines: the check mark was stored as mis-decoded UTF-8; restored here.
print("✓ Modules imported successfully")
print(f"✓ Data directory: {DATA_DIR}")
print(f"✓ Prices directory: {PRICES_DIR}")

## Step 1: Scrape ETF Universe from Multiple Sources

In [None]:
# Build the scraper object that every collection step below calls into.
scraper = ComprehensiveETFScraper()

# Opening banner for the collection phase.
_collection_rule = "=" * 80
print("Collecting ETFs from multiple sources...\n")
print(_collection_rule)

In [None]:
# Source 1: ETF Database (primary comprehensive source)
print("\n[1/3] Scraping ETF Database (etfdb.com)...")
print("-" * 80)

# Scrape at most 50 listing pages.
etfdb_etfs = scraper.scrape_etfdb_all_etfs(max_pages=50)

print(f"\n✓ ETF Database: {len(etfdb_etfs)} ETFs collected")
# Only index into the result when rows came back (same guard as the Nasdaq
# step): an empty scrape should report 0 ETFs instead of raising on a
# missing 'ticker' column.
if len(etfdb_etfs) > 0:
    print(f"  Sample: {list(etfdb_etfs['ticker'].head(10))}")

In [None]:
# Source 2: Nasdaq listings
print("\n[2/3] Scraping Nasdaq ETF listings...")
print("-" * 80)

nasdaq_etfs = scraper.scrape_nasdaq_listings()

# The check mark was stored as mis-decoded UTF-8; restored here.
print(f"\n✓ Nasdaq: {len(nasdaq_etfs)} ETFs collected")
# The sample is shown only when the scrape returned rows.
if len(nasdaq_etfs) > 0:
    print(f"  Sample: {list(nasdaq_etfs['ticker'].head(10))}")

In [None]:
# Source 3: Comprehensive seed list (backup/supplement)
print("\n[3/3] Loading comprehensive seed list...")
print("-" * 80)

# NOTE(review): this calls an underscore-prefixed (non-public) scraper method;
# consider exposing a public accessor in the scraper module.
seed_etfs = scraper._get_comprehensive_seed_list()

print(f"\n✓ Seed List: {len(seed_etfs)} ETFs")
# Category details are optional metadata; report them only when present,
# matching the 'category' guard used when filtering the universe.
if 'category' in seed_etfs.columns:
    print(f"  Categories: {seed_etfs['category'].nunique()}")
    print(f"  Sample categories: {list(seed_etfs['category'].unique()[:5])}")

## Step 2: Merge and Deduplicate

In [None]:
print("\nMerging and deduplicating sources...")
print("=" * 80)

# Merge all sources into one table; the scraper handles deduplication.
all_sources = [etfdb_etfs, nasdaq_etfs, seed_etfs]
merged_universe = scraper.merge_and_deduplicate(all_sources)

# Per-source counts next to the merged total show how much the sources overlap.
print(f"\n✓ Total unique ETFs after merge: {len(merged_universe)}")
print("\nBreakdown:")
print(f"  - ETF Database: {len(etfdb_etfs)}")
print(f"  - Nasdaq: {len(nasdaq_etfs)}")
print(f"  - Seed List: {len(seed_etfs)}")
print(f"  - Unique merged: {len(merged_universe)}")

# Preview
print("\nPreview of merged universe:")
display(merged_universe.head(10))

## Step 3: Filter Universe

In [None]:
# Single source of truth for the AUM threshold, so the printed description
# cannot drift from the value actually passed to the filter.
MIN_AUM = 10e6  # $10M minimum

print("\nFiltering ETF universe...")
print("=" * 80)
print("\nFilters applied:")
print("  - Remove leveraged/inverse ETFs")
print(f"  - Remove low AUM (<${MIN_AUM / 1e6:.0f}M)")
print("  - Clean ticker symbols")
print()

filtered_universe = scraper.filter_universe(
    merged_universe,
    min_aum=MIN_AUM,
    remove_leveraged=True
)

print(f"\n✓ Filtered universe: {len(filtered_universe)} ETFs")
print(f"  Removed: {len(merged_universe) - len(filtered_universe)} ETFs")

# Show category distribution if available (top 15 categories by ETF count)
if 'category' in filtered_universe.columns:
    print("\nCategory distribution:")
    category_counts = filtered_universe['category'].value_counts().head(15)
    for cat, count in category_counts.items():
        print(f"  {cat:30s}: {count:4d} ETFs")

## Step 4: Parallel Price Data Download

**This will take 30-90 minutes, depending on the number of ETFs and network speed.**

Progress will be shown every 50 ETFs.

In [None]:
# Configure the parallel price downloader
downloader = ParallelETFDownloader(
    output_dir=PRICES_DIR,
    min_years=2.0,   # Require minimum 2 years of data
    max_workers=20,  # 20 parallel downloads
    max_retries=3,
)

# Echo the effective configuration back so the run is self-documenting.
_config_rule = "=" * 80
print("Parallel Downloader Configuration:")
print(_config_rule)
print(f"  Output directory: {PRICES_DIR}")
print(f"  Minimum data: {downloader.min_years} years ({downloader.min_days} days)")
print(f"  Parallel workers: {downloader.max_workers}")
print(f"  Max retries per ETF: {downloader.max_retries}")

_etf_count = len(filtered_universe)
print(f"\n  ETFs to download: {_etf_count}")

# Rough time window: universe size divided by twice the worker count (low end)
# and by the worker count (high end).
_eta_low = _etf_count / (downloader.max_workers * 2)
_eta_high = _etf_count / downloader.max_workers
print(f"  Estimated time: {_eta_low:.0f}-{_eta_high:.0f} minutes")
print()

In [None]:
# Download price data
print("\nStarting parallel download...")
print("=" * 80)
print("(Progress updates every 50 ETFs)\n")

start_time = datetime.now()

# Get list of tickers
tickers = filtered_universe['ticker'].tolist()

# Download
download_results = downloader.download_batch(tickers)

end_time = datetime.now()
duration = (end_time - start_time).total_seconds() / 60  # minutes

print(f"\n✓ Download completed in {duration:.1f} minutes")
# A very short run (e.g. an empty ticker list) can measure as zero elapsed
# time; do not divide by it.
if duration > 0:
    print(f"  Rate: {len(tickers) / duration:.1f} ETFs/minute")
else:
    print("  Rate: n/a (elapsed time too short to measure)")

## Step 5: Analyze Results

In [None]:
# Download statistics
from collections import Counter


def _failure_category(msg):
    """Map a raw failure message onto one of the reporting buckets."""
    # Messages can be missing (NaN) or otherwise non-string; normalise first
    # so the substring checks below cannot raise.
    text = str(msg)
    if 'No data' in text:
        return 'No data returned'
    if 'Insufficient' in text:
        return 'Insufficient history'
    if 'missing' in text.lower():
        return 'Too much missing data'
    return 'Other errors'


print("\nDownload Statistics:")
print("=" * 80)

total = len(download_results)
successful = int(download_results['success'].sum())
failed = total - successful
# An empty result set would otherwise divide by zero; report 0% for both.
success_rate = (successful / total) * 100 if total else 0.0
failure_rate = 100 - success_rate if total else 0.0

print(f"  Total ETFs: {total}")
print(f"  Successful: {successful} ({success_rate:.1f}%)")
print(f"  Failed: {failed} ({failure_rate:.1f}%)")

# Show failure reasons
if failed > 0:
    print("\nTop failure reasons:")
    failed_results = download_results[~download_results['success']]

    # Categorize failures; most_common() lists buckets by descending count.
    failure_categories = Counter(
        _failure_category(msg) for msg in failed_results['message']
    )

    for reason, count in failure_categories.most_common():
        print(f"  - {reason}: {count}")

In [None]:
# Update universe with download results
print("\nUpdating universe with download results...")

# Take one timestamp for the whole cell so the collection date written into
# the universe and the results filename always describe the same moment.
collected_at = datetime.now()

# Merge results: the left join keeps every filtered ETF, including any that
# have no row in download_results.
final_universe = filtered_universe.merge(
    download_results[['ticker', 'success', 'message']],
    on='ticker',
    how='left'
)

# Add collection date
final_universe['data_collection_date'] = collected_at.strftime('%Y-%m-%d')

# Save complete universe (overwrites any previous universe file)
final_universe.to_csv(UNIVERSE_FILE, index=False)
print(f"✓ Saved universe to: {UNIVERSE_FILE}")

# Save download results under a timestamped name so earlier logs are kept
results_file = RESULTS_DIR / f"etf_download_results_{collected_at.strftime('%Y%m%d_%H%M%S')}.csv"
download_results.to_csv(results_file, index=False)
print(f"✓ Saved download results to: {results_file}")

## Step 6: Data Quality Validation

In [None]:
# Validate downloaded data
print("\nValidating price data quality...")
print("=" * 80)

# Get successfully downloaded ETFs
successful_tickers = download_results[download_results['success']]['ticker'].tolist()

# Only a sample is inspected to keep this cell fast.
VALIDATION_SAMPLE_SIZE = 100

validation_stats = []
skipped_files = []  # (ticker, reason) for every sampled file left out of the stats

for ticker in successful_tickers[:VALIDATION_SAMPLE_SIZE]:
    file_path = PRICES_DIR / f"{ticker}.csv"

    if not file_path.exists():
        skipped_files.append((ticker, 'price file not found'))
        continue

    try:
        df = pd.read_csv(file_path, index_col=0, parse_dates=True)

        validation_stats.append({
            'ticker': ticker,
            'num_days': len(df),
            'start_date': df.index.min(),
            'end_date': df.index.max(),
            # isna().mean() is the missing fraction (NaN for an empty file)
            'missing_close_pct': df['Close'].isna().mean() * 100,
            'missing_volume_pct': df['Volume'].isna().mean() * 100
        })
    except Exception as exc:
        # Validation stays best-effort, but a bad file is recorded and
        # reported below instead of vanishing silently.
        skipped_files.append((ticker, f"{type(exc).__name__}: {exc}"))

validation_df = pd.DataFrame(validation_stats)

if len(validation_df) > 0:
    print(f"\nValidation sample: {len(validation_df)} ETFs")
    print("\nData coverage:")
    print(f"  Average days of data: {validation_df['num_days'].mean():.0f}")
    print(f"  Min days: {validation_df['num_days'].min():.0f}")
    print(f"  Max days: {validation_df['num_days'].max():.0f}")
    print(f"  Average missing Close: {validation_df['missing_close_pct'].mean():.2f}%")
    print(f"  Average missing Volume: {validation_df['missing_volume_pct'].mean():.2f}%")

    print("\nDate range:")
    print(f"  Earliest start: {validation_df['start_date'].min()}")
    print(f"  Latest end: {validation_df['end_date'].max()}")

if skipped_files:
    print(f"\nSkipped {len(skipped_files)} file(s) during validation (showing up to 10):")
    for skipped_ticker, reason in skipped_files[:10]:
        print(f"  - {skipped_ticker}: {reason}")

## Step 7: Final Summary & Visualizations

In [None]:
# Final summary
print("\n" + "=" * 80)
print("FINAL ETF UNIVERSE SUMMARY")
print("=" * 80)

print(f"\nCollection Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("\nETF Counts:")
print(f"  Total scraped: {len(merged_universe)}")
print(f"  After filtering: {len(filtered_universe)}")
print(f"  Successfully downloaded: {successful}")
print(f"  Failed downloads: {failed}")
print(f"  Success rate: {success_rate:.1f}%")

print(f"\nData Quality (sample of {len(validation_df)} ETFs):")
if len(validation_df) > 0:
    avg_rows = validation_df['num_days'].mean()
    # 'num_days' is the number of rows in each price file, not calendar days,
    # so dividing it by 365 misstates the history length. Derive the years
    # from the actual first-to-last date span instead.
    history_span = (
        pd.to_datetime(validation_df['end_date'], errors='coerce', utc=True)
        - pd.to_datetime(validation_df['start_date'], errors='coerce', utc=True)
    )
    avg_years = history_span.dt.days.mean() / 365.25
    print(f"  Average history: {avg_rows:.0f} days ({avg_years:.1f} years)")
    print(f"  Data completeness: {100 - validation_df['missing_close_pct'].mean():.1f}%")

print("\nFiles saved:")
print(f"  Universe: {UNIVERSE_FILE}")
print(f"  Price data: {PRICES_DIR} ({successful} files)")
print(f"  Results: {results_file}")

print("\n✓ ETF Universe Collection Complete!")
print("=" * 80)

In [None]:
# Visualizations: a 2x2 dashboard of download and data-quality results
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Success/Failure pie chart
if successful + failed > 0:
    axes[0, 0].pie(
        [successful, failed],
        labels=['Successful', 'Failed'],
        autopct='%1.1f%%',
        colors=['#2ecc71', '#e74c3c'],
        startangle=90
    )
else:
    # A pie of two zeros cannot be normalised; show a placeholder instead.
    axes[0, 0].text(0.5, 0.5, 'No download results', ha='center', va='center',
                    transform=axes[0, 0].transAxes)
    axes[0, 0].set_xticks([])
    axes[0, 0].set_yticks([])
axes[0, 0].set_title(f'Download Results ({total} ETFs)', fontsize=14, fontweight='bold')

# 2. Data history distribution
if len(validation_df) > 0:
    axes[0, 1].hist(validation_df['num_days'], bins=30, color='#3498db', edgecolor='black')
    axes[0, 1].set_xlabel('Days of Historical Data')
    axes[0, 1].set_ylabel('Number of ETFs')
    axes[0, 1].set_title('Data History Distribution', fontsize=14, fontweight='bold')
    # NOTE(review): 'num_days' is a row count per price file; confirm that 730
    # rows is the intended "2 years" marker for this data.
    axes[0, 1].axvline(730, color='red', linestyle='--', label='2 years')
    axes[0, 1].legend()

# 3. Category distribution (if available)
if 'category' in final_universe.columns:
    # .eq(True) treats rows with a missing 'success' value as not successful
    successful_etfs = final_universe[final_universe['success'].eq(True)]
    if len(successful_etfs) > 0:
        category_counts = successful_etfs['category'].value_counts().head(10)
        axes[1, 0].barh(range(len(category_counts)), category_counts.values, color='#9b59b6')
        axes[1, 0].set_yticks(range(len(category_counts)))
        axes[1, 0].set_yticklabels(category_counts.index)
        axes[1, 0].set_xlabel('Number of ETFs')
        axes[1, 0].set_title('Top 10 Categories (Successful Downloads)', fontsize=14, fontweight='bold')
        # Largest category at the top
        axes[1, 0].invert_yaxis()

# 4. Missing data percentage
if len(validation_df) > 0:
    axes[1, 1].hist(validation_df['missing_close_pct'], bins=20, color='#e67e22', edgecolor='black')
    axes[1, 1].set_xlabel('Missing Data (%)')
    axes[1, 1].set_ylabel('Number of ETFs')
    axes[1, 1].set_title('Data Completeness Distribution', fontsize=14, fontweight='bold')
    axes[1, 1].axvline(5, color='red', linestyle='--', label='5% threshold')
    axes[1, 1].legend()

plt.tight_layout()
# Build the output path once so the confirmation message names the actual file
summary_png = RESULTS_DIR / f"etf_universe_summary_{datetime.now().strftime('%Y%m%d')}.png"
plt.savefig(summary_png, dpi=300, bbox_inches='tight')
plt.show()

print(f"✓ Visualization saved to: {summary_png}")

In [None]:
# Display successful ETFs sample
print("\nSample of successfully downloaded ETFs:")

# Metadata columns depend on which sources supplied a row's data, so select
# only the ones that exist instead of raising KeyError on a missing column.
_preferred_columns = ['ticker', 'name', 'category', 'aum']
_available_columns = [col for col in _preferred_columns if col in final_universe.columns]

# .eq(True) treats rows with a missing 'success' value as not successful
successful_sample = final_universe.loc[
    final_universe['success'].eq(True), _available_columns
].head(20)
display(successful_sample)

## Next Steps

**Universe is ready!** You can now:

1. ✅ **Proceed to Phase 3:** Portfolio Optimization with a small sample (100 ETFs)
2. ✅ **Use full universe:** All successfully downloaded ETFs are available
3. 🔄 **Update monthly:** Re-run this notebook to add new ETFs
4. 🔄 **Update prices weekly:** Re-run Step 4 only to refresh price data

**Files created:**
- `data/raw/etf_universe.csv` - Complete universe with metadata
- `data/raw/prices/*.csv` - Individual ETF price files
- `results/etf_download_results_*.csv` - Download log
- `results/etf_universe_summary_*.png` - Visualization