In [2]:
import requests
from datetime import datetime, timedelta
import time
import os
import alpaca_trade_api as tradeapi
import pandas as pd
from transformers import pipeline, BertTokenizer, BertForSequenceClassification

In [3]:
from dotenv import load_dotenv
import os 


load_dotenv()


ALPACA_API_KEY = os.getenv("ALPACA_API_KEY")
ALPACA_SECRET_KEY = os.getenv("ALPACA_SECRET_KEY")
ALPACA_URL = os.getenv("ALPACA_URL")


alpaca = tradeapi.REST(ALPACA_API_KEY, ALPACA_SECRET_KEY, base_url=ALPACA_URL, api_version='v2')

In [4]:
from transformers import pipeline, BertTokenizer, BertForSequenceClassification
import pandas as pd

# Load the model and tokenizer
model_name = "yiyanghkust/finbert-tone"  # Example of a financial sentiment model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [5]:
# Function to split text into 512-token chunks based on tokenization
def split_into_chunks(text, max_length=500):
    tokens = tokenizer(text, return_tensors="pt", truncation=False)['input_ids'][0]
    # Ensure each chunk is no more than 512 tokens
    chunks = []
    for i in range(0, len(tokens), max_length):
        chunk = tokens[i:i + max_length]
        # Make sure the chunk is exactly 512 tokens or less
        if len(chunk) > max_length:
            chunk = chunk[:max_length]
        chunks.append(chunk)
    return chunks

In [6]:
# Function to analyze sentiment for long texts
def analyze_sentiment(text):
    chunks = split_into_chunks(text)
    sentiments = []
    for chunk in chunks:
        # Convert tokens back to text before sentiment analysis
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        sentiments.append(nlp(chunk_text)[0])
    
    # Aggregate sentiment scores (e.g., by averaging)
    avg_sentiment_score = sum(s['score'] for s in sentiments) / len(sentiments)
    # Determine overall sentiment by majority vote or averaging
    positive_scores = sum(s['score'] for s in sentiments if s['label'] == 'positive')
    negative_scores = sum(s['score'] for s in sentiments if s['label'] == 'negative')
    sentiment_label = 'positive' if positive_scores >= negative_scores else 'negative'
    
    return sentiment_label, avg_sentiment_score

In [7]:
def get_historical_stock_data(symbols, start_date, end_date):
    # Use the correct TimeFrame object for daily data
    timeframe = tradeapi.TimeFrame.Day
    
    all_data = []
    
    for symbol in symbols:
        bars = alpaca.get_bars(
            symbol,
            timeframe=timeframe,
            start=start_date.strftime('%Y-%m-%dT%H:%M:%SZ'),
            end=end_date.strftime('%Y-%m-%dT%H:%M:%SZ'),
            adjustment='raw',
            feed='iex'
        )
        
        data = []
        for bar in bars:
            data.append({
                'symbol': symbol,  # Add the symbol to the data
                'time': bar.t,
                'open': bar.o,
                'high': bar.h,
                'low': bar.l,
                'close': bar.c,
                'volume': bar.v
            })
        
        all_data.extend(data)
    
    return pd.DataFrame(all_data)

In [8]:
# Function to load the most recent date from the CSV file
def load_last_update_date(file_path="stock_data.csv"):
    try:
        df = pd.read_csv(file_path)

        # Ensure that the 'date' column is converted to datetime, coercing errors
        df['date'] = pd.to_datetime(df['date'], errors='coerce')

        # Drop rows where 'date' couldn't be converted to datetime (i.e., NaT)
        df = df.dropna(subset=['date'])

        # Find the most recent date
        most_recent_date = df['date'].max()

        return most_recent_date
    except (FileNotFoundError, IndexError, KeyError):
        return None

In [9]:
# Function to save the current date as the last update date
def save_last_update_date(date, file_path="stock_data.csv"):
    df = pd.read_csv(file_path)
    df['date'] = pd.to_datetime(df['date'])
    df.loc[df['date'].idxmax(), 'date'] = date
    df.to_csv(file_path, index=False)

In [10]:
# Function to get news from the Alpaca API
def get_new_news(symbols, start_date, end_date):
    url = "https://data.alpaca.markets/v1beta1/news"
    headers = {
        "APCA-API-KEY-ID": ALPACA_API_KEY,
        "APCA-API-SECRET-KEY": ALPACA_SECRET_KEY
    }
    news = []
    page_token = None
    
    while True:
        params = {
            "symbols": ",".join(symbols),
            "start": start_date.strftime('%Y-%m-%dT%H:%M:%SZ'),
            "end": end_date.strftime('%Y-%m-%dT%H:%M:%SZ'),
            "limit": 50,
            "page_token": page_token
        }
        try:
            response = requests.get(url, headers=headers, params=params)
            response.raise_for_status()
            result = response.json()
            
            news.extend(result.get('news', []))
            
            page_token = result.get('next_page_token')
            if not page_token:
                break
            
        except requests.exceptions.HTTPError as e:
            if response.status_code == 429:
                print("Rate limit reached. Sleeping for 10 seconds...")
                time.sleep(10)
            else:
                print(f"HTTP error occurred: {e}")
                break
        except Exception as e:
            print(f"An error occurred: {e}")
            break
        
    return news

In [11]:
def update_stock_data(symbols, file_path="stock_data.csv"):
    # Load the last update date
    last_update = load_last_update_date(file_path)
    if last_update is None:
        last_update = datetime.utcnow() - timedelta(days=30)  # Default to last 30 days
    else:
        last_update = pd.to_datetime(last_update)  # Convert to datetime
    
    # Get the current date
    today = datetime.utcnow()

    # Stop execution if last_update and today are the same
    if last_update.date() == today.date():
        print("Data is already up-to-date. No new data to fetch.")
        return
    
    # Retrieve news from Alpaca
    new_news = get_new_news(symbols, last_update, today)
    
    # Convert new news data to DataFrame
    new_news_df = pd.DataFrame(new_news)

    # Keep only the necessary columns and prepare the data
    if not new_news_df.empty:
        new_news_df['created_at'] = pd.to_datetime(new_news_df['created_at'])
        new_news_df['date'] = new_news_df['created_at'].dt.date
        new_news_df = new_news_df.explode('symbols')
        new_news_df['sentiment'], new_news_df['sentiment_score'] = zip(*new_news_df['headline'].apply(analyze_sentiment))
        new_news_df = new_news_df[['date', 'symbols', 'sentiment_score']]
        daily_sentiment = new_news_df.groupby(['date', 'symbols'])['sentiment_score'].mean().reset_index()
        daily_sentiment.columns = ['date', 'symbol', 'average_sentiment_score']
    else:
        daily_sentiment = pd.DataFrame()

    # Fetch historical stock data
    historical_data = get_historical_stock_data(symbols, last_update, today)

    historical_data['date'] = pd.to_datetime(historical_data['time']).dt.date

    # Merge historical stock data with sentiment data
    stock_data_merged = pd.merge(historical_data, daily_sentiment, how='left', on=['date', 'symbol'])

    # Load existing stock data
    try:
        existing_data = pd.read_csv(file_path)
    except FileNotFoundError:
        existing_data = pd.DataFrame()

    # Combine the new merged data with the existing data
    combined_data = pd.concat([existing_data, stock_data_merged], ignore_index=True)
    
    # Save the updated DataFrame
    combined_data.to_csv(file_path, index=False)
    
    # Save the most recent update date
    save_last_update_date(today, file_path)


In [12]:
top_50_sp500_stocks = [
    'AAPL',  # Apple Inc.
    'MSFT',  # Microsoft Corporation
    'AMZN',  # Amazon.com Inc.
    'NVDA',  # NVIDIA Corporation
    'GOOGL', # Alphabet Inc. (Class A)
    'GOOG',  # Alphabet Inc. (Class C)
    'TSLA',  # Tesla Inc.
    'META',  # Meta Platforms Inc.
    'BRK.B', # Berkshire Hathaway Inc. (Class B)
    'UNH',   # UnitedHealth Group Incorporated
    'JNJ',   # Johnson & Johnson
    'XOM',   # Exxon Mobil Corporation
    'V',     # Visa Inc.
    'PG',    # Procter & Gamble Co.
    'JPM',   # JPMorgan Chase & Co.
    'LLY',   # Eli Lilly and Company
    'MA',    # Mastercard Incorporated
    'HD',    # The Home Depot Inc.
    'CVX',   # Chevron Corporation
    'MRK',   # Merck & Co. Inc.
    'PEP',   # PepsiCo Inc.
    'ABBV',  # AbbVie Inc.
    'KO',    # The Coca-Cola Company
    'PFE',   # Pfizer Inc.
    'AVGO',  # Broadcom Inc.
    'COST',  # Costco Wholesale Corporation
    'MCD',   # McDonald's Corporation
    'TMO',   # Thermo Fisher Scientific Inc.
    'WMT',   # Walmart Inc.
    'DHR',   # Danaher Corporation
    'NKE',   # NIKE Inc.
    'DIS',   # The Walt Disney Company
    'ADBE',  # Adobe Inc.
    'NFLX',  # Netflix Inc.
    'VZ',    # Verizon Communications Inc.
    'CSCO',  # Cisco Systems Inc.
    'ABT',   # Abbott Laboratories
    'ACN',   # Accenture plc
    'NEE',   # NextEra Energy Inc.
    'LIN',   # Linde plc
    'TXN',   # Texas Instruments Incorporated
    'MDT',   # Medtronic plc
    'PM',    # Philip Morris International Inc.
    'WFC',   # Wells Fargo & Company
    'HON',   # Honeywell International Inc.
    'QCOM',  # QUALCOMM Incorporated
    'BMY',   # Bristol-Myers Squibb Company
    'LOW',   # Lowe's Companies Inc.
    'UNP',   # Union Pacific Corporation
    'RTX'    # Raytheon Technologies Corporation
]


In [13]:
update_stock_data(top_50_sp500_stocks)

HTTP error occurred: 403 Client Error: Forbidden for url: https://data.alpaca.markets/v1beta1/news?symbols=AAPL%2CMSFT%2CAMZN%2CNVDA%2CGOOGL%2CGOOG%2CTSLA%2CMETA%2CBRK.B%2CUNH%2CJNJ%2CXOM%2CV%2CPG%2CJPM%2CLLY%2CMA%2CHD%2CCVX%2CMRK%2CPEP%2CABBV%2CKO%2CPFE%2CAVGO%2CCOST%2CMCD%2CTMO%2CWMT%2CDHR%2CNKE%2CDIS%2CADBE%2CNFLX%2CVZ%2CCSCO%2CABT%2CACN%2CNEE%2CLIN%2CTXN%2CMDT%2CPM%2CWFC%2CHON%2CQCOM%2CBMY%2CLOW%2CUNP%2CRTX&start=2024-08-27T01%3A14%3A06Z&end=2024-08-31T19%3A56%3A34Z&limit=50


KeyError: 'date'

In [14]:
# read stock data and display head

stock_data = pd.read_csv("stock_data.csv")
print(stock_data.head())

  symbol                       time     open    high     low    close  volume  \
0   AAPL  2024-07-25 00:00:00-04:00  218.880  220.81  214.64  217.420  642703   
1   AAPL  2024-07-26 00:00:00-04:00  218.940  219.48  216.04  218.030  661067   
2   AAPL  2024-07-29 00:00:00-04:00  217.375  219.28  215.79  218.185  381966   
3   AAPL  2024-07-30 00:00:00-04:00  219.300  220.27  216.12  218.680  559407   
4   AAPL  2024-07-31 00:00:00-04:00  221.520  223.81  220.91  222.180  549326   

                         date  average_sentiment_score  log_sentiment_score  
0  2024-07-25 00:00:00.000000                 0.947577            -0.053847  
1  2024-07-26 00:00:00.000000                 0.972036            -0.028362  
2  2024-07-29 00:00:00.000000                 0.955637            -0.045377  
3  2024-07-30 00:00:00.000000                 0.964756            -0.035880  
4  2024-07-31 00:00:00.000000                 0.989995            -0.010056  
