In [2]:
import pandas as pd
import requests
from io import StringIO

def get_sp500_tickers():
    """Scrapes the list of S&P 500 tickers from Wikipedia."""
    print("Fetching S&P 500 ticker list from Wikipedia...")
    
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    
    # Add a User-Agent header to mimic a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    
    # Make the request with the new headers
    response = requests.get(url, headers=headers)
    
    # Use StringIO to wrap the text, which is the modern way to use read_html
    tables = pd.read_html(StringIO(response.text))
    
    # The rest of your function remains the same
    sp500_df = tables[0]
    tickers = sp500_df['Symbol'].str.replace('.', '-', regex=False).tolist()
    print(f"Found {len(tickers)} tickers.")
    return tickers

# Get the list
sp500_tickers = get_sp500_tickers()
print(sp500_tickers[:10]) # Print first 10 to check

Fetching S&P 500 ticker list from Wikipedia...
Found 504 tickers.
['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A']


In [3]:
import pandas as pd
import yfinance as yf
from gnews import GNews
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import time
from datetime import date, timedelta
import nltk

# --- NLTK Setup ---
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    print("Vader lexicon not found. Downloading...")
    nltk.download('vader_lexicon')
print("Vader lexicon is ready.")

# --- Parameters ---
END_DATE = date.today()
START_DATE = END_DATE - timedelta(days=30)

# --- Initialize ---
gnews = GNews(start_date=START_DATE, end_date=END_DATE)
sia = SentimentIntensityAnalyzer() # Initialize it once here
all_dataframes = []
# --- Main Loop ---
print("\nStarting data collection for all S&P 500 stocks...")
for i, ticker in enumerate(sp500_tickers[:6]):
    print(f"Processing {i+1}/{len(sp500_tickers)}: {ticker}")
    try:
        # --- 1. Fetch Stock Data ---
        stock_df = yf.download(ticker, start=START_DATE, end=END_DATE, progress=False)
        
        # !!! THIS IS THE CRITICAL FIX !!!
        # Flatten the multi-level column index if it exists
        if isinstance(stock_df.columns, pd.MultiIndex):
            stock_df.columns = stock_df.columns.droplevel(1)

        if stock_df.empty:
            print(f"  No stock data for {ticker}. Skipping.")
            continue

        # --- 2. Fetch News Data ---
        news_articles = gnews.get_news(f'{ticker} stock')
        if not news_articles:
            print(f"  No news for {ticker}. Proceeding without sentiment.")
            stock_df['sentiment'] = 0.0
        else:
            news_df = pd.DataFrame(news_articles)
            # --- 3. Perform Sentiment Analysis ---
            news_df['sentiment'] = news_df['title'].apply(lambda title: sia.polarity_scores(title)['compound'])
            news_df['date'] = pd.to_datetime(news_df['published date']).dt.date
            
            # --- 4. Combine Datasets ---
            daily_sentiment = news_df.groupby('date')['sentiment'].mean()
            daily_sentiment.index = pd.to_datetime(daily_sentiment.index)
            
            # Now the .join() will work because the columns are simple
            stock_df = stock_df.join(daily_sentiment, how='left')
            
            stock_df['sentiment'].fillna(method='ffill', inplace=True)
            stock_df['sentiment'].fillna(0, inplace=True)

        # Add a ticker column for identification
        stock_df['ticker'] = ticker
        all_dataframes.append(stock_df)

        # IMPORTANT: Pause to be respectful to the APIs
        time.sleep(1)

    except Exception as e:
        print(f"  An error occurred for {ticker}: {e}")

# --- Final Combination ---
print("\nCombining all data into a single DataFrame...")
# This will now work because all_dataframes will not be empty
if all_dataframes:
    final_df = pd.concat(all_dataframes)
    print("\n--- Final Combined Dataset ---")
    print(final_df.head())
    print("...")
    print(final_df.tail())
else:
    print("No data was collected.")

Vader lexicon is ready.

Starting data collection for all S&P 500 stocks...
Processing 1/504: MMM


  stock_df = yf.download(ticker, start=START_DATE, end=END_DATE, progress=False)
08/28/2025 06:50:46 PM - ('Connection aborted.', OSError(22, 'Invalid argument'))


  No news for MMM. Proceeding without sentiment.
Processing 2/504: AOS


  stock_df = yf.download(ticker, start=START_DATE, end=END_DATE, progress=False)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  stock_df['sentiment'].fillna(method='ffill', inplace=True)
  stock_df['sentiment'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  stock_df['sentiment'].fillna(0,

Processing 3/504: ABT


  stock_df = yf.download(ticker, start=START_DATE, end=END_DATE, progress=False)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  stock_df['sentiment'].fillna(method='ffill', inplace=True)
  stock_df['sentiment'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  stock_df['sentiment'].fillna(0,

Processing 4/504: ABBV


  stock_df = yf.download(ticker, start=START_DATE, end=END_DATE, progress=False)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  stock_df['sentiment'].fillna(method='ffill', inplace=True)
  stock_df['sentiment'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  stock_df['sentiment'].fillna(0,

Processing 5/504: ACN


  stock_df = yf.download(ticker, start=START_DATE, end=END_DATE, progress=False)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  stock_df['sentiment'].fillna(method='ffill', inplace=True)
  stock_df['sentiment'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  stock_df['sentiment'].fillna(0,

Processing 6/504: ADBE


  stock_df = yf.download(ticker, start=START_DATE, end=END_DATE, progress=False)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  stock_df['sentiment'].fillna(method='ffill', inplace=True)
  stock_df['sentiment'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  stock_df['sentiment'].fillna(0,


Combining all data into a single DataFrame...

--- Final Combined Dataset ---
                 Close        High         Low        Open   Volume  \
Date                                                                  
2025-07-29  151.091873  152.395861  150.454817  151.699068  3918100   
2025-07-30  145.945648  151.798609  145.826205  151.778697  3529500   
2025-07-31  148.533691  148.872124  144.890521  145.517629  4390400   
2025-08-01  143.745819  146.045192  143.586551  145.826203  4074400   
2025-08-04  147.189896  147.498467  144.333101  144.721306  3641300   

            sentiment ticker  
Date                          
2025-07-29        0.0    MMM  
2025-07-30        0.0    MMM  
2025-07-31        0.0    MMM  
2025-08-01        0.0    MMM  
2025-08-04        0.0    MMM  
...
                 Close        High         Low        Open   Volume  \
Date                                                                  
2025-08-21  353.429993  353.820007  347.000000  349.910004  

In [4]:
final_df

Unnamed: 0_level_0,Close,High,Low,Open,Volume,sentiment,ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-07-29,151.091873,152.395861,150.454817,151.699068,3918100,0.000000,MMM
2025-07-30,145.945648,151.798609,145.826205,151.778697,3529500,0.000000,MMM
2025-07-31,148.533691,148.872124,144.890521,145.517629,4390400,0.000000,MMM
2025-08-01,143.745819,146.045192,143.586551,145.826203,4074400,0.000000,MMM
2025-08-04,147.189896,147.498467,144.333101,144.721306,3641300,0.000000,MMM
...,...,...,...,...,...,...,...
2025-08-21,353.429993,353.820007,347.000000,349.910004,2363200,0.000000,ADBE
2025-08-22,362.089996,362.649994,354.000000,355.799988,3058600,0.000000,ADBE
2025-08-25,363.209991,364.649994,361.399994,362.559998,2259500,0.012240,ADBE
2025-08-26,354.910004,363.230011,353.700012,363.029999,4067400,0.118087,ADBE


In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import matplotlib.pyplot as plt

# Assume 'final_df' is your loaded DataFrame
# For this example, we'll focus on a single stock
ticker_to_predict = 'ADBE'
df_stock = final_df[final_df['ticker'] == ticker_to_predict].copy()

# We'll use 'Close' price and 'sentiment' as features
data = df_stock[['Close', 'sentiment']].values

# Scale the data
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data)

# Create a separate scaler just for the 'Close' price for easy inverse transformation later
scaler_price = MinMaxScaler(feature_range=(0, 1))
scaler_price.fit_transform(df_stock[['Close']])

array([[1.        ],
       [0.81916305],
       [0.64885281],
       [0.38191598],
       [0.14035114],
       [0.13738175],
       [0.3230768 ],
       [0.12469616],
       [0.19972983],
       [0.        ],
       [0.12901475],
       [0.47017557],
       [0.40296856],
       [0.57219978],
       [0.63670674],
       [0.73900112],
       [0.53387285],
       [0.53387285],
       [0.76761102],
       [0.7978403 ],
       [0.57381914],
       [0.61268559]])

In [6]:
LOOK_BACK = 60
PREDICT_DAYS = 30

X_train, y_train = [], []

for i in range(LOOK_BACK, len(scaled_data) - PREDICT_DAYS):
    # X contains the last 60 days of [Close, sentiment]
    X_train.append(scaled_data[i-LOOK_BACK:i])
    # y contains the next 30 days of just the 'Close' price
    y_train.append(scaled_data[i:i+PREDICT_DAYS, 0])

X_train, y_train = np.array(X_train), np.array(y_train)

In [7]:
model = Sequential()

# First LSTM layer
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))

# Second LSTM layer
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))

# Output layer - predicts 30 values
model.add(Dense(units=PREDICT_DAYS))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

IndexError: tuple index out of range