In [2]:
import pandas as pd

def fetch_sp500_tickers():
    # Scrape the list of S&P 500 constituents from Wikipedia
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    tables = pd.read_html(url)
    df = tables[0]   # first table on the page
    tickers = df['Symbol'].tolist()
    return tickers



# Example usage:
sp500 = fetch_sp500_tickers()


print(f"Loaded {len(sp500)} tickers; sample:", sp500[:10])


Loaded 502 tickers; sample: ['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A']


In [None]:
import pandas as pd

def fetch_nasdaq100_tickers():
    url = "https://en.wikipedia.org/wiki/Nasdaq-100"
    tables = pd.read_html(url)
    
    # 1) Loop through each table and inspect its column names
    for i, df in enumerate(tables):
        # Safely lowercase every column name by first converting to str
        lower_cols = [str(col).lower() for col in df.columns]
        
        if "ticker" in lower_cols or "symbol" in lower_cols:
            # pick the actual column name matching ticker/symbol
            orig_col = df.columns[lower_cols.index("ticker")] if "ticker" in lower_cols \
                       else df.columns[lower_cols.index("symbol")]
            print(f"‚Üí Using table #{i} with column '{orig_col}'")
            # strip whitespace and return as a list
            return df[orig_col].astype(str).str.strip().tolist()
    
    # If we exit the loop without returning, no suitable column was found
    raise ValueError("Could not find a 'Ticker' or 'Symbol' column on that page.")

# Run it and see:
nasdaq100 = fetch_nasdaq100_tickers()
print(f"Found {len(nasdaq100)} Nasdaq-100 tickers; sample: {nasdaq100[:10]}")


‚Üí Using table #4 with column 'Ticker'
Found 101 Nasdaq-100 tickers; sample: ['ADBE', 'AMD', 'ABNB', 'GOOGL', 'GOOG', 'AMZN', 'AEP', 'AMGN', 'ADI', 'ANSS']


In [7]:
import pandas as pd

def fetch_tickers_from_wikipedia(url: str) -> list:
    """
    Fetches the first column named 'Ticker' or 'Symbol' from any Wikipedia table on `url`.
    Strips whitespace and normalizes dots to dashes (for yfinance compatibility).
    """
    tables = pd.read_html(url)
    for df in tables:
        # lowercase colnames for matching
        lower = [str(c).lower() for c in df.columns]
        for key in ("ticker", "symbol"):
            if key in lower:
                orig = df.columns[lower.index(key)]
                # normalize and return
                return (
                    df[orig]
                    .astype(str)
                    .str.strip()
                    .str.replace(r"\.", "-", regex=True)
                    .tolist()
                )
    raise ValueError(f"No 'Ticker' or 'Symbol' column found at {url}")

def fetch_sp500_tickers() -> list:
    return fetch_tickers_from_wikipedia(
        "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    )

def fetch_nasdaq100_tickers() -> list:
    return fetch_tickers_from_wikipedia(
        "https://en.wikipedia.org/wiki/Nasdaq-100"
    )

def fetch_dow30_tickers() -> list:
    return fetch_tickers_from_wikipedia(
        "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average"
    )

def fetch_russell2000_tickers() -> list:
    return fetch_tickers_from_wikipedia(
        "https://en.wikipedia.org/wiki/Russell_2000_Index"
    )

# Example usage
if __name__ == "__main__":
    sp500   = fetch_sp500_tickers()
    nas100  = fetch_nasdaq100_tickers()
    dow30   = fetch_dow30_tickers()
    russell = fetch_russell2000_tickers()

    print(f"S&P 500:    {len(sp500)} tickers, sample {sp500[:5]}")
    print(f"Nasdaq-100: {len(nas100)} tickers, sample {nas100[:5]}")
    print(f"Dow 30:     {len(dow30)} tickers, sample {dow30[:5]}")
    print(f"Russell 2000: {len(russell)} tickers, sample {russell[:5]}")


S&P 500:    502 tickers, sample ['MMM', 'AOS', 'ABT', 'ABBV', 'ACN']
Nasdaq-100: 101 tickers, sample ['ADBE', 'AMD', 'ABNB', 'GOOGL', 'GOOG']
Dow 30:     30 tickers, sample ['MMM', 'AXP', 'AMGN', 'AMZN', 'AAPL']
Russell 2000: 11 tickers, sample ['ADTN', 'ALIT', 'CHX', 'GTLS', 'DNLI']


In [8]:
import pandas as pd
nasdaq100_df = pd.DataFrame(nas100, columns=["Ticker"])
snp_500_df=pd.DataFrame(sp500, columns=["Ticker"])
russell_df=pd.DataFrame(russell, columns=["Ticker"])
dow30_df= pd.DataFrame(dow30, columns=["Ticker"])


In [10]:
merged_tickers = pd.concat([snp_500_df,nasdaq100_df,russell_df,dow30_df], axis=0, ignore_index=True).reset_index(drop=True)

merged_tickers_unique = merged_tickers.drop_duplicates().reset_index(drop=True)
merged_tickers_unique.shape

(528, 1)

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
from typing import List, Dict, Optional

def build_universe(raw_tickers: List[str], 
                   lookback_days: int = 90,
                   min_avg_vol: float = 500_000, 
                   min_mktcap: float = 1e9,
                   min_price: float = 5.0,
                   max_per_sector: int = 30,
                   min_data_coverage: float = 0.8,
                   verbose: bool = True) -> pd.DataFrame:
    """
    Build a filtered stock universe from raw tickers based on liquidity, 
    market cap, price, and sector diversification criteria.
    
    Parameters:
    -----------
    raw_tickers : List[str]
        List of ticker symbols to filter
    lookback_days : int
        Number of days to look back for volume calculations
    min_avg_vol : float
        Minimum average daily volume
    min_mktcap : float
        Minimum market capitalization
    min_price : float
        Minimum stock price
    max_per_sector : int
        Maximum stocks per sector
    min_data_coverage : float
        Minimum fraction of data points required (0.8 = 80%)
    verbose : bool
        Print progress information
        
    Returns:
    --------
    pd.DataFrame
        Filtered universe with ticker, sector, market cap, price, and volume data
    """
    
    if verbose:
        print(f"Building universe from {len(raw_tickers)} tickers...")
    
    # 1) Download data with error handling
    end = datetime.today()
    start = end - timedelta(days=int(lookback_days * 1.3))
    
    try:
        data = yf.download(
            raw_tickers,
            start=start.strftime("%Y-%m-%d"),
            end=end.strftime("%Y-%m-%d"),
            auto_adjust=True,
            group_by="ticker",
            threads=True,
            progress=verbose
        )
    except Exception as e:
        print(f"Error downloading data: {e}")
        return pd.DataFrame()
    
    # Handle single ticker case
    if len(raw_tickers) == 1:
        ticker = raw_tickers[0]
        data = {ticker: data}
    
    # 2) Extract adjusted price & volume with data quality checks
    prices = pd.DataFrame()
    volumes = pd.DataFrame()
    
    failed_tickers = []
    for ticker in raw_tickers:
        try:
            if ticker in data and not data[ticker].empty:
                # Check data coverage
                ticker_data = data[ticker]
                coverage = ticker_data['Close'].notna().sum() / len(ticker_data)
                
                if coverage >= min_data_coverage:
                    prices[ticker] = ticker_data['Close']
                    volumes[ticker] = ticker_data['Volume']
                else:
                    failed_tickers.append(f"{ticker} (coverage: {coverage:.1%})")
            else:
                failed_tickers.append(f"{ticker} (no data)")
        except Exception as e:
            failed_tickers.append(f"{ticker} (error: {str(e)[:50]})")
    
    if failed_tickers and verbose:
        print(f"Failed to process {len(failed_tickers)} tickers: {failed_tickers[:5]}...")
    
    if prices.empty:
        print("No valid price data found!")
        return pd.DataFrame()
    
    # 3) Compute stats with error handling
    try:
        avg_vol = volumes.rolling(window=lookback_days, min_periods=int(lookback_days * 0.7)).mean().iloc[-1]
        last_price = prices.iloc[-1]
        
        # Calculate additional metrics
        price_volatility = prices.pct_change().rolling(window=lookback_days).std().iloc[-1] * np.sqrt(252)
        
    except Exception as e:
        print(f"Error computing statistics: {e}")
        return pd.DataFrame()
    
    # 4) Get market-cap & sector info with batch processing
    if verbose:
        print("Fetching company information...")
    
    infos = {}
    valid_tickers = prices.columns.tolist()
    
    for i, ticker in enumerate(valid_tickers):
        try:
            info = yf.Ticker(ticker).info
            infos[ticker] = {
                "marketCap": info.get("marketCap", np.nan),
                "sector": info.get("sector", "Unknown"),
                "industry": info.get("industry", "Unknown"),
                "country": info.get("country", "Unknown")
            }
            
            if verbose and (i + 1) % 20 == 0:
                print(f"  Processed {i + 1}/{len(valid_tickers)} tickers")
                
        except Exception as e:
            infos[ticker] = {
                "marketCap": np.nan,
                "sector": "Unknown",
                "industry": "Unknown", 
                "country": "Unknown"
            }
    
    info_df = pd.DataFrame.from_dict(infos, orient="index")
    
    # 5) Assemble and filter
    df = pd.DataFrame({
        "avg_vol": avg_vol,
        "price": last_price,
        "volatility": price_volatility
    }).join(info_df)
    
    # Apply filters
    initial_count = len(df)
    
    # Volume filter
    vol_filter = df["avg_vol"] >= min_avg_vol
    df = df[vol_filter]
    if verbose:
        print(f"After volume filter (>={min_avg_vol:,.0f}): {len(df)} stocks ({initial_count - len(df)} removed)")
    
    # Market cap filter
    mktcap_filter = df["marketCap"] >= min_mktcap
    df = df[mktcap_filter]
    if verbose:
        print(f"After market cap filter (>=${min_mktcap/1e9:.1f}B): {len(df)} stocks")
    
    # Price filter
    price_filter = df["price"] >= min_price
    df = df[price_filter]
    if verbose:
        print(f"After price filter (>=${min_price}): {len(df)} stocks")
    
    # Remove stocks with missing key data
    complete_data_filter = df[["marketCap", "avg_vol", "price"]].notna().all(axis=1)
    df = df[complete_data_filter]
    if verbose:
        print(f"After data completeness filter: {len(df)} stocks")
    
    if df.empty:
        print("No stocks passed all filters!")
        return df
    
    # 6) Cap each sector to top N by market-cap
    if verbose:
        print(f"Applying sector caps ({max_per_sector} per sector)...")
    
    universe = (
        df
        .reset_index()
        .rename(columns={"index": "ticker"})
        .groupby("sector", group_keys=False)
        .apply(lambda g: g.nlargest(max_per_sector, "marketCap"))
        .set_index("ticker")
        .sort_values("marketCap", ascending=False)
    )
    
    if verbose:
        print(f"\nFinal universe: {len(universe)} stocks")
        print(f"Sector distribution:")
        sector_counts = universe["sector"].value_counts()
        for sector, count in sector_counts.items():
            print(f"  {sector}: {count}")
    
    return universe

def analyze_universe(universe: pd.DataFrame) -> Dict:
    """
    Analyze the characteristics of the filtered universe.
    
    Parameters:
    -----------
    universe : pd.DataFrame
        The filtered stock universe
        
    Returns:
    --------
    Dict
        Analysis results including sector distribution, size distribution, etc.
    """
    if universe.empty:
        return {}
    
    analysis = {
        "total_stocks": len(universe),
        "total_market_cap": universe["marketCap"].sum(),
        "median_market_cap": universe["marketCap"].median(),
        "median_price": universe["price"].median(),
        "median_volume": universe["avg_vol"].median(),
        "sector_distribution": universe["sector"].value_counts().to_dict(),
        "size_distribution": {
            "Large Cap (>10B)": (universe["marketCap"] > 10e9).sum(),
            "Mid Cap (1B-10B)": ((universe["marketCap"] >= 1e9) & (universe["marketCap"] <= 10e9)).sum(),
            "Small Cap (<1B)": (universe["marketCap"] < 1e9).sum()
        }
    }
    
    return analysis

# Example usage with error handling
if __name__ == "__main__":
    # Example NASDAQ 100 tickers (partial list for demo)
    # nasdaq_sample = [
    #     'AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'TSLA', 'META', 'AVGO', 'COST', 'NFLX',
    #     'ADBE', 'PEP', 'TMUS', 'CSCO', 'CMCSA', 'INTC', 'TXN', 'QCOM', 'AMGN', 'INTU'
    # ]
    
    try:
        # Build universe
        universe = build_universe(
            merged_tickers_unique['Ticker'].to_list(),
            lookback_days=90,
            min_avg_vol=1_000_000,
            min_mktcap=5e9,
            min_price=10.0,
            max_per_sector=15,
            verbose=True
        )
        
        if not universe.empty:
            # Analyze results
            analysis = analyze_universe(universe)
            
            print(f"\n=== UNIVERSE ANALYSIS ===")
            print(f"Total stocks: {analysis['total_stocks']}")
            print(f"Total market cap: ${analysis['total_market_cap']/1e12:.1f}T")
            print(f"Median market cap: ${analysis['median_market_cap']/1e9:.1f}B")
            print(f"Median price: ${analysis['median_price']:.2f}")
            print(f"Median daily volume: {analysis['median_volume']:,.0f}")
            
            print("\nTop 10 stocks by market cap:")
            print(universe.head(10)[['sector', 'marketCap', 'price', 'avg_vol']].round(2))
            
        else:
            print("No stocks in final universe!")
            
    except Exception as e:
        print(f"Error in main execution: {e}")

Building universe from 517 tickers...


[*********************100%***********************]  517 of 517 completed

2 Failed downloads:
['BRK.B']: YFTzMissingError('possibly delisted; no timezone found')
['BF.B']: YFPricesMissingError('possibly delisted; no price data found  (1d 2025-03-11 -> 2025-07-06)')
  prices[ticker] = ticker_data['Close']
  volumes[ticker] = ticker_data['Volume']
  prices[ticker] = ticker_data['Close']
  volumes[ticker] = ticker_data['Volume']
  prices[ticker] = ticker_data['Close']
  volumes[ticker] = ticker_data['Volume']
  prices[ticker] = ticker_data['Close']
  volumes[ticker] = ticker_data['Volume']
  prices[ticker] = ticker_data['Close']
  volumes[ticker] = ticker_data['Volume']
  prices[ticker] = ticker_data['Close']
  volumes[ticker] = ticker_data['Volume']
  prices[ticker] = ticker_data['Close']
  volumes[ticker] = ticker_data['Volume']
  prices[ticker] = ticker_data['Close']
  volumes[ticker] = ticker_data['Volume']
  prices[ticker] = ticker_data['Close']
  volumes[ticker] = ticker_data['Volum

Failed to process 2 tickers: ['BRK.B (coverage: 0.0%)', 'BF.B (coverage: 0.0%)']...
Fetching company information...
  Processed 20/515 tickers
  Processed 40/515 tickers
  Processed 60/515 tickers
  Processed 80/515 tickers
  Processed 100/515 tickers
  Processed 120/515 tickers
  Processed 140/515 tickers
  Processed 160/515 tickers
  Processed 180/515 tickers
  Processed 200/515 tickers
  Processed 220/515 tickers
  Processed 240/515 tickers
  Processed 260/515 tickers
  Processed 280/515 tickers
  Processed 300/515 tickers
  Processed 320/515 tickers
  Processed 340/515 tickers
  Processed 360/515 tickers
  Processed 380/515 tickers
  Processed 400/515 tickers
  Processed 420/515 tickers
  Processed 440/515 tickers
  Processed 460/515 tickers
  Processed 480/515 tickers
  Processed 500/515 tickers
After volume filter (>=1,000,000): 433 stocks (82 removed)
After market cap filter (>=$5.0B): 433 stocks
After price filter (>=$10.0): 431 stocks
After data completeness filter: 431 stocks

  .apply(lambda g: g.nlargest(max_per_sector, "marketCap"))


In [None]:
universe.head()

Unnamed: 0_level_0,avg_vol,price,volatility,marketCap,sector,industry,country
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NVDA,249626900.0,159.339996,,3885920157696,Technology,Semiconductors,United States
MSFT,22865490.0,498.839996,,3707648344064,Technology,Software - Infrastructure,United States
AAPL,61097840.0,213.550003,,3189540126720,Technology,Consumer Electronics,United States
AMZN,48410770.0,223.410004,,2371809968128,Consumer Cyclical,Internet Retail,United States
GOOGL,40436790.0,179.529999,,2184126005248,Communication Services,Internet Content & Information,United States


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple

def analyze_concentration(universe: pd.DataFrame, top_n: int = 10) -> Dict:
    """
    Analyze concentration risk in the universe.
    
    Parameters:
    -----------
    universe : pd.DataFrame
        The filtered stock universe
    top_n : int
        Number of top stocks to analyze
        
    Returns:
    --------
    Dict
        Concentration metrics
    """
    total_mcap = universe['marketCap'].sum()
    universe_sorted = universe.sort_values('marketCap', ascending=False)
    
    # Calculate cumulative percentages
    cumulative_pct = universe_sorted['marketCap'].cumsum() / total_mcap
    
    concentration_metrics = {
        'total_market_cap': total_mcap,
        'top_1_pct': cumulative_pct.iloc[0],
        'top_3_pct': cumulative_pct.iloc[2] if len(cumulative_pct) >= 3 else cumulative_pct.iloc[-1],
        'top_5_pct': cumulative_pct.iloc[4] if len(cumulative_pct) >= 5 else cumulative_pct.iloc[-1],
        'top_10_pct': cumulative_pct.iloc[9] if len(cumulative_pct) >= 10 else cumulative_pct.iloc[-1],
        'herfindahl_index': ((universe['marketCap'] / total_mcap) ** 2).sum()
    }
    
    return concentration_metrics

def suggest_rebalancing(universe: pd.DataFrame, target_sector_weights: Dict = None) -> pd.DataFrame:
    """
    Suggest rebalancing to achieve target sector weights.
    
    Parameters:
    -----------
    universe : pd.DataFrame
        The filtered stock universe
    target_sector_weights : Dict
        Target weights by sector (if None, uses equal weight)
        
    Returns:
    --------
    pd.DataFrame
        Rebalancing suggestions
    """
    if target_sector_weights is None:
        # Equal weight across sectors
        unique_sectors = universe['sector'].unique()
        target_sector_weights = {sector: 1.0/len(unique_sectors) for sector in unique_sectors}
    
    current_weights = universe.groupby('sector')['marketCap'].sum()
    current_weights = current_weights / current_weights.sum()
    
    rebalancing = pd.DataFrame({
        'current_weight': current_weights,
        'target_weight': pd.Series(target_sector_weights),
        'difference': pd.Series(target_sector_weights) - current_weights
    }).fillna(0)
    
    rebalancing['action'] = rebalancing['difference'].apply(
        lambda x: 'INCREASE' if x > 0.02 else ('DECREASE' if x < -0.02 else 'HOLD')
    )
    
    return rebalancing.sort_values('difference', ascending=False)

def calculate_risk_metrics(universe: pd.DataFrame, prices_df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate risk metrics for each stock in the universe.
    
    Parameters:
    -----------
    universe : pd.DataFrame
        The filtered stock universe
    prices_df : pd.DataFrame
        Historical prices (from your original data)
        
    Returns:
    --------
    pd.DataFrame
        Risk metrics for each stock
    """
    risk_metrics = []
    
    for ticker in universe.index:
        if ticker in prices_df.columns:
            prices = prices_df[ticker].dropna()
            returns = prices.pct_change().dropna()
            
            if len(returns) > 30:  # Minimum data requirement
                volatility = returns.std() * np.sqrt(252)  # Annualized
                max_drawdown = calculate_max_drawdown(prices)
                sharpe_ratio = returns.mean() / returns.std() * np.sqrt(252) if returns.std() > 0 else 0
                
                risk_metrics.append({
                    'ticker': ticker,
                    'volatility': volatility,
                    'max_drawdown': max_drawdown,
                    'sharpe_ratio': sharpe_ratio,
                    'beta': calculate_beta(returns, prices_df)  # vs market proxy
                })
    
    return pd.DataFrame(risk_metrics).set_index('ticker')

def calculate_max_drawdown(prices: pd.Series) -> float:
    """Calculate maximum drawdown for a price series."""
    peak = prices.expanding().max()
    drawdown = (prices - peak) / peak
    return drawdown.min()

def calculate_beta(stock_returns: pd.Series, market_prices: pd.DataFrame) -> float:
    """Calculate beta vs market (using average of all stocks as market proxy)."""
    try:
        market_returns = market_prices.mean(axis=1).pct_change().dropna()
        
        # Align dates
        common_dates = stock_returns.index.intersection(market_returns.index)
        stock_aligned = stock_returns.loc[common_dates]
        market_aligned = market_returns.loc[common_dates]
        
        if len(stock_aligned) > 30:
            covariance = np.cov(stock_aligned, market_aligned)[0, 1]
            market_variance = np.var(market_aligned)
            return covariance / market_variance if market_variance > 0 else 1.0
        else:
            return 1.0
    except:
        return 1.0

def optimize_sector_allocation(universe: pd.DataFrame, max_sector_weight: float = 0.4) -> pd.DataFrame:
    """
    Optimize sector allocation to reduce concentration risk.
    
    Parameters:
    -----------
    universe : pd.DataFrame
        The filtered stock universe
    max_sector_weight : float
        Maximum weight for any single sector
        
    Returns:
    --------
    pd.DataFrame
        Optimized allocation suggestions
    """
    # Current sector weights
    sector_weights = universe.groupby('sector')['marketCap'].sum()
    sector_weights = sector_weights / sector_weights.sum()
    
    # Identify over-allocated sectors
    over_allocated = sector_weights[sector_weights > max_sector_weight]
    
    if len(over_allocated) == 0:
        return pd.DataFrame({'message': ['No rebalancing needed']})
    
    # Calculate adjustments needed
    adjustments = []
    for sector, weight in over_allocated.items():
        excess = weight - max_sector_weight
        sector_stocks = universe[universe['sector'] == sector].sort_values('marketCap', ascending=False)
        
        # Suggest reducing positions in largest stocks
        cumulative_excess = 0
        for ticker, row in sector_stocks.iterrows():
            if cumulative_excess < excess:
                stock_weight = row['marketCap'] / universe['marketCap'].sum()
                reduction = min(stock_weight * 0.5, excess - cumulative_excess)  # Reduce by up to 50%
                
                adjustments.append({
                    'ticker': ticker,
                    'sector': sector,
                    'current_weight': stock_weight,
                    'suggested_reduction': reduction,
                    'action': 'REDUCE'
                })
                cumulative_excess += reduction
    
    return pd.DataFrame(adjustments)

def create_universe_dashboard(universe: pd.DataFrame, analysis: Dict) -> str:
    """
    Create a comprehensive dashboard summary.
    
    Parameters:
    -----------
    universe : pd.DataFrame
        The filtered stock universe
    analysis : Dict
        Analysis results from analyze_universe()
        
    Returns:
    --------
    str
        Formatted dashboard text
    """
    dashboard = f"""
=== UNIVERSE DASHBOARD ===

üìä PORTFOLIO OVERVIEW
Total Stocks: {analysis['total_stocks']}
Total Market Cap: ${analysis['total_market_cap']/1e12:.1f}T
Median Market Cap: ${analysis['median_market_cap']/1e9:.1f}B
Median Price: ${analysis['median_price']:.2f}
Median Daily Volume: {analysis['median_volume']:,.0f}

üè≠ SECTOR BREAKDOWN
"""
    
    for sector, count in analysis['sector_distribution'].items():
        pct = count / analysis['total_stocks'] * 100
        dashboard += f"{sector}: {count} stocks ({pct:.1f}%)\n"
    
    dashboard += f"""
üìà SIZE DISTRIBUTION
Large Cap (>$10B): {analysis['size_distribution']['Large Cap (>10B)']} stocks
Mid Cap ($1B-$10B): {analysis['size_distribution']['Mid Cap (1B-10B)']} stocks
Small Cap (<$1B): {analysis['size_distribution']['Small Cap (<1B)']} stocks

‚ö†Ô∏è  CONCENTRATION ANALYSIS
"""
    
    # Add concentration metrics
    concentration = analyze_concentration(universe)
    dashboard += f"Top 1 Stock: {concentration['top_1_pct']:.1%} of total market cap\n"
    dashboard += f"Top 3 Stocks: {concentration['top_3_pct']:.1%} of total market cap\n"
    dashboard += f"Top 5 Stocks: {concentration['top_5_pct']:.1%} of total market cap\n"
    dashboard += f"Herfindahl Index: {concentration['herfindahl_index']:.3f} (lower = more diversified)\n"
    
    # Risk assessment
    if concentration['top_3_pct'] > 0.5:
        dashboard += "\nüî¥ HIGH CONCENTRATION RISK: Top 3 stocks >50% of portfolio\n"
    elif concentration['top_3_pct'] > 0.3:
        dashboard += "\nüü° MODERATE CONCENTRATION RISK: Top 3 stocks >30% of portfolio\n"
    else:
        dashboard += "\nüü¢ LOW CONCENTRATION RISK: Well diversified\n"
    
    dashboard += f"""
üéØ RECOMMENDATIONS
‚Ä¢ Consider sector caps of 25-30% to reduce tech concentration
‚Ä¢ Monitor liquidity: {(universe['avg_vol'] < 10_000_000).sum()} stocks have <10M daily volume
‚Ä¢ Review price impact: {(universe['price'] > 500).sum()} stocks are >$500/share
‚Ä¢ Rebalance frequency: Monthly given high correlation in mega-caps
"""
    
    return dashboard

# Example usage with your universe
def run_comprehensive_analysis(universe: pd.DataFrame) -> None:
    """
    Run a comprehensive analysis of the universe.
    
    Parameters:
    -----------
    universe : pd.DataFrame
        The filtered stock universe
    """
    print("Running comprehensive universe analysis...")
    
    # Basic analysis
    analysis = {
        "total_stocks": len(universe),
        "total_market_cap": universe["marketCap"].sum(),
        "median_market_cap": universe["marketCap"].median(),
        "median_price": universe["price"].median(),
        "median_volume": universe["avg_vol"].median(),
        "sector_distribution": universe["sector"].value_counts().to_dict(),
        "size_distribution": {
            "Large Cap (>10B)": (universe["marketCap"] > 10e9).sum(),
            "Mid Cap (1B-10B)": ((universe["marketCap"] >= 1e9) & (universe["marketCap"] <= 10e9)).sum(),
            "Small Cap (<1B)": (universe["marketCap"] < 1e9).sum()
        }
    }
    
    # Print dashboard
    dashboard = create_universe_dashboard(universe, analysis)
    print(dashboard)
    
    # Sector rebalancing suggestions
    print("\n=== SECTOR REBALANCING SUGGESTIONS ===")
    rebalancing = suggest_rebalancing(universe)
    print(rebalancing)
    
    # Concentration optimization
    print("\n=== CONCENTRATION OPTIMIZATION ===")
    optimization = optimize_sector_allocation(universe, max_sector_weight=0.3)
    if 'message' not in optimization.columns:
        print(optimization)
    else:
        print("‚úÖ Sector allocation within recommended limits")

# Run the analysis
if __name__ == "__main__":
    # Assuming 'universe' is your filtered universe DataFrame
    run_comprehensive_analysis(universe)
    pass

Running comprehensive universe analysis...

=== UNIVERSE DASHBOARD ===

üìä PORTFOLIO OVERVIEW
Total Stocks: 165
Total Market Cap: $45.5T
Median Market Cap: $112.8B
Median Price: $143.18
Median Daily Volume: 5,043,321

üè≠ SECTOR BREAKDOWN
Technology: 15 stocks (9.1%)
Consumer Cyclical: 15 stocks (9.1%)
Communication Services: 15 stocks (9.1%)
Financial Services: 15 stocks (9.1%)
Consumer Defensive: 15 stocks (9.1%)
Healthcare: 15 stocks (9.1%)
Energy: 15 stocks (9.1%)
Industrials: 15 stocks (9.1%)
Basic Materials: 15 stocks (9.1%)
Utilities: 15 stocks (9.1%)
Real Estate: 15 stocks (9.1%)

üìà SIZE DISTRIBUTION
Large Cap (>$10B): 165 stocks
Mid Cap ($1B-$10B): 0 stocks
Small Cap (<$1B): 0 stocks

‚ö†Ô∏è  CONCENTRATION ANALYSIS
Top 1 Stock: 8.5% of total market cap
Top 3 Stocks: 23.7% of total market cap
Top 5 Stocks: 33.7% of total market cap
Herfindahl Index: 0.033 (lower = more diversified)

üü¢ LOW CONCENTRATION RISK: Well diversified

üéØ RECOMMENDATIONS
‚Ä¢ Consider sector ca

In [None]:
universe['sector'].unique()

array(['Technology', 'Consumer Cyclical', 'Communication Services',
       'Financial Services', 'Consumer Defensive', 'Healthcare', 'Energy',
       'Industrials', 'Basic Materials', 'Utilities', 'Real Estate'],
      dtype=object)

In [None]:
universe.to_csv("ticker_universe.csv", index=False)


In [None]:
import os
print(os.listdir("."))        # should list ticker_universe.csv

# And to load it later:
import pandas as pd
df = pd.read_csv("ticker_universe.csv", index_col=0)
df.head()


['calculate_summary.ipynb', 'calculate_trades.ipynb', 'data_prep.ipynb', 'gdrive-creds.json', 'gspread.ipynb', 'ibkr.ipynb', 'ml_train.ipynb', 'stocks_df_combined_2025_07_03.parquet.brotli', 'ticker_universe.csv']


Unnamed: 0_level_0,price,volatility,marketCap,sector,industry,country
avg_vol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
249626900.0,159.339996,,3885920157696,Technology,Semiconductors,United States
22865490.0,498.839996,,3707648344064,Technology,Software - Infrastructure,United States
61097840.0,213.550003,,3189540126720,Technology,Consumer Electronics,United States
48410770.0,223.410004,,2371809968128,Consumer Cyclical,Internet Retail,United States
40436790.0,179.529999,,2184126005248,Communication Services,Internet Content & Information,United States


In [None]:
def build_universe_tiers(raw_tickers, lookback_days=90, verbose=True):
    # 1) run your existing build_universe with loose liquidity/cap
    base = build_universe(
        raw_tickers,
        lookback_days=lookback_days,
        min_avg_vol=200_000,     # much lower
        min_mktcap=250e6,        # down to $250M
        min_price=1.0,           # penny‚Äêstock guard
        max_per_sector=100,      # very high cap so tiering later
        min_data_coverage=0.8,
        verbose=verbose
    )

    # 2) split into Large / Mid / Small by marketCap
    large = base[base.marketCap >= 10e9]
    mid   = base[(base.marketCap >= 2e9) & (base.marketCap < 10e9)]
    small = base[(base.marketCap < 2e9)]

    # 3) optionally re-apply liquidity floor per tier
    mid = mid[mid.avg_vol >= 500_000]
    small = small[small.avg_vol >= 300_000]

    # 4) recombine and return
    return pd.concat([large, mid, small]).sort_values('marketCap', ascending=False)


In [1]:
def cap_by_sector(df, cap_base=15, scale_factor=0.5):
    counts = df.sector.value_counts()
    def cap(n):
        # e.g. large sectors get 1√ócap, small ones 0.5√ócap
        weight = min(1, counts[n]/counts.max())
        return int(cap_base * (scale_factor + 0.5*weight))
    return (
      df.reset_index()
        .groupby('sector', group_keys=False)
        .apply(lambda g: g.nlargest(cap(g.name), 'marketCap'))
        .set_index('ticker')
    )


Expand Your Raw Ticker List
If you started only with S&P 500 or Nasdaq-100 tickers, you can broaden to:

Russell 2000 (small-cap index)

Regional exchanges (e.g. TSX, LSE ADRs)

Sector-specific ETFs: take their holdings as a universe

ETF ‚Äúholders‚Äù: scrape top 50 names from popular sector ETFs

Example for Russell 2000 via yfinance (ticker = ^RUT doesn‚Äôt give constituents, but you can pull from an external CSV or package like investpy).

In [None]:
# 1) Fetch a broad raw list (e.g. your merged tickers plus Russell-2000 list)
raw = merged_tickers_unique['Ticker'].to_list() + russell2000_list

# 2) Build a tiered universe
broad_universe = build_universe_tiers(raw, lookback_days=90, verbose=True)

# 3) Cap by dynamic sector rules
broad_universe = cap_by_sector(broad_universe, cap_base=20, scale_factor=0.5)

# 4) Compute liquidity weights for downstream sizing
broad_universe['dollar_vol'] = broad_universe.price * broad_universe.avg_vol
dv = broad_universe.dollar_vol
broad_universe['liq_weight'] = (dv - dv.min())/(dv.max()-dv.min())
