In [None]:
import yfinance as yf
import pandas as pd
import requests
from datetime import datetime

# Function to fetch S&P 500 tickers
def get_sp500_tickers():
    """
    Scrape the S&P 500 ticker symbols from Wikipedia.

    Returns:
        list: The values of the ``Symbol`` column of the first table on the
        page, or an empty list when the page could not be retrieved
        (network error or a non-200 HTTP status).
    """
    # Local import so the function stays usable when only this cell is run.
    from io import StringIO

    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    # Wikimedia asks automated clients to identify themselves with a
    # descriptive User-Agent rather than a library default.
    headers = {"User-Agent": "sp500-ticker-notebook/1.0 (python-requests)"}

    try:
        # A timeout keeps the notebook from hanging forever on a stalled
        # connection.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Treat transport failures like any other failed retrieval.
        print(f"Failed to retrieve S&P 500 tickers: {exc}")
        return []

    if response.status_code != 200:
        print("Failed to retrieve S&P 500 tickers.")
        return []

    # Passing literal HTML to read_html is deprecated; wrap it in a
    # file-like object instead.
    tables = pd.read_html(StringIO(response.text))
    return tables[0]['Symbol'].tolist()

# Pull the S&P 500 symbol list, keeping at most the first 2000 entries.
# Slicing a shorter list simply yields every element, so no explicit
# length check is required. (Expand to other indices if more are needed.)
tickers = get_sp500_tickers()[:2000]

# Query window and bar size handed to the downloader
interval = "1d"  # one bar per day
start_date, end_date = "2024-12-25", "2025-02-09"

# Function to fetch stock data
def get_all_stock_data(tickers, start_date, end_date, interval):
    """
    Fetch price history for several tickers and stack it into one DataFrame.

    Args:
        tickers: Iterable of ticker symbols to download.
        start_date: Start of the range, forwarded to ``Ticker.history``.
        end_date: End of the range, forwarded to ``Ticker.history``.
        interval: Bar interval (e.g. ``"1d"``), forwarded to
            ``Ticker.history``.

    Returns:
        The per-ticker histories concatenated row-wise, with a ``Ticker``
        column holding the symbol exactly as supplied by the caller, or
        ``None`` when no ticker produced any data.
    """
    all_data = []

    for ticker in tickers:
        print(f"Fetching data for {ticker}...")
        try:
            data = yf.Ticker(ticker).history(
                start=start_date, end=end_date, interval=interval
            )
            # Dotted share-class symbols such as BRK.B or BF.B come back
            # empty from Yahoo, which lists them with a hyphen (BRK-B).
            # Retry with that spelling only after the original symbol
            # fails, so exchange-suffixed symbols that use a dot keep working.
            if data.empty and isinstance(ticker, str) and "." in ticker:
                fallback = ticker.replace(".", "-")
                print(f"No data found for {ticker}; retrying as {fallback}...")
                data = yf.Ticker(fallback).history(
                    start=start_date, end=end_date, interval=interval
                )
        except Exception as e:
            # Best effort: one failing ticker must not abort the whole run.
            print(f"Error fetching data for {ticker}: {e}")
            continue

        if data.empty:
            print(f"No data found for {ticker}")
            continue

        # Label rows with the caller's symbol so downstream filters that
        # use the original spelling still match.
        data["Ticker"] = ticker
        all_data.append(data)

    # Combine all data into one DataFrame
    if not all_data:
        return None
    return pd.concat(all_data, axis=0)

# Download the histories for every ticker over the configured window
df = get_all_stock_data(tickers, start_date, end_date, interval)

# Persist the result, or report that nothing came back
if df is None:
    print("No data retrieved. Check tickers or date range.")
else:
    filename = "sp500_stocks_custom_range67.csv"
    df.to_csv(filename)
    print(f"All stock data saved as {filename}")
    print(df.head())  # Preview of the first few rows


  tables = pd.read_html(response.text)


Fetching data for MMM...
Fetching data for AOS...
Fetching data for ABT...
Fetching data for ABBV...
Fetching data for ACN...
Fetching data for ADBE...
Fetching data for AMD...
Fetching data for AES...
Fetching data for AFL...
Fetching data for A...
Fetching data for APD...
Fetching data for ABNB...
Fetching data for AKAM...
Fetching data for ALB...
Fetching data for ARE...
Fetching data for ALGN...
Fetching data for ALLE...
Fetching data for LNT...
Fetching data for ALL...
Fetching data for GOOGL...
Fetching data for GOOG...
Fetching data for MO...
Fetching data for AMZN...
Fetching data for AMCR...
Fetching data for AEE...
Fetching data for AEP...
Fetching data for AXP...
Fetching data for AIG...
Fetching data for AMT...
Fetching data for AWK...
Fetching data for AMP...
Fetching data for AME...
Fetching data for AMGN...
Fetching data for APH...
Fetching data for ADI...
Fetching data for ANSS...
Fetching data for AON...
Fetching data for APA...
Fetching data for APO...
Fetching data f

$BRK.B: possibly delisted; no timezone found


No data found for BRK.B
Fetching data for BBY...
Fetching data for TECH...
Fetching data for BIIB...
Fetching data for BLK...
Fetching data for BX...
Fetching data for BK...
Fetching data for BA...
Fetching data for BKNG...
Fetching data for BWA...
Fetching data for BSX...
Fetching data for BMY...
Fetching data for AVGO...
Fetching data for BR...
Fetching data for BRO...
Fetching data for BF.B...


$BF.B: possibly delisted; no price data found  (1d 2024-12-25 -> 2025-02-09)


No data found for BF.B
Fetching data for BLDR...
Fetching data for BG...
Fetching data for BXP...
Fetching data for CHRW...
Fetching data for CDNS...
Fetching data for CZR...
Fetching data for CPT...
Fetching data for CPB...
Fetching data for COF...
Fetching data for CAH...
Fetching data for KMX...
Fetching data for CCL...
Fetching data for CARR...
Fetching data for CAT...
Fetching data for CBOE...
Fetching data for CBRE...
Fetching data for CDW...
Fetching data for CE...
Fetching data for COR...
Fetching data for CNC...
Fetching data for CNP...
Fetching data for CF...
Fetching data for CRL...
Fetching data for SCHW...
Fetching data for CHTR...
Fetching data for CVX...
Fetching data for CMG...
Fetching data for CB...
Fetching data for CHD...
Fetching data for CI...
Fetching data for CINF...
Fetching data for CTAS...
Fetching data for CSCO...
Fetching data for C...
Fetching data for CFG...
Fetching data for CLX...
Fetching data for CME...
Fetching data for CMS...
Fetching data for KO...

In [None]:
import pandas as pd

# Load the dataset
file_path = "sp500_stocks_custom_range67.csv"
df = pd.read_csv(file_path)

# Hard-coded list of the 70 S&P 500 tickers to keep
top_70_tickers = [
    "AAPL", "MSFT", "AMZN", "GOOGL", "GOOG", "BRK.B", "NVDA", "TSLA", "META", "V",
    "UNH", "JNJ", "XOM", "JPM", "WMT", "MA", "PG", "LLY", "HD", "CVX",
    "MRK", "ABBV", "PEP", "KO", "AVGO", "COST", "MCD", "DHR", "ACN", "TXN",
    "LIN", "NEE", "VZ", "WFC", "BMY", "PM", "SCHW", "MS", "RTX", "UPS",
    "UNP", "INTC", "LOW", "QCOM", "HON", "ORCL", "AMT", "IBM", "TMO", "CAT",
    "GS", "MDT", "BLK", "LMT", "CB", "NOW", "SPGI", "TGT", "ISRG", "DE",
    "ADI", "GE", "PYPL", "ADBE", "NFLX", "CSCO", "AMGN", "CRM", "MO", "SO",
]

# Filter the dataset for only the listed tickers
top_70_stocks = df[df['Ticker'].isin(top_70_tickers)]

# Surface requested tickers that have no rows in the dataset, so a short
# output file is not mistaken for a complete one.
present_tickers = set(top_70_stocks['Ticker'])
missing_tickers = [t for t in top_70_tickers if t not in present_tickers]
if missing_tickers:
    print(
        f"Warning: no rows found for {len(missing_tickers)} ticker(s): "
        f"{', '.join(missing_tickers)}"
    )

# Save to a new CSV file
top_70_file_path = "top_70_sp500_stocksy5.csv"
top_70_stocks.to_csv(top_70_file_path, index=False)

print(f"Top 70 stocks saved to {top_70_file_path}")
