In [1]:
import pandas as pd

# Load analyst ratings data
df_news = pd.read_csv("../data/raw/raw_analyst_ratings.csv", index_col=0)

# Convert date to datetime
df_news['date'] = pd.to_datetime(df_news['date'], errors='coerce')

# Filter for one stock symbol (e.g., 'A') for this example
df_news = df_news[df_news['stock'] == 'A']

print("News Headlines Sample:")
print(df_news[['headline', 'date']].head())

News Headlines Sample:
                                            headline                      date
0            Stocks That Hit 52-Week Highs On Friday 2020-06-05 10:30:54-04:00
1         Stocks That Hit 52-Week Highs On Wednesday 2020-06-03 10:45:20-04:00
2                      71 Biggest Movers From Friday 2020-05-26 04:30:07-04:00
3       46 Stocks Moving In Friday's Mid-Day Session 2020-05-22 12:45:06-04:00
4  B of A Securities Maintains Neutral on Agilent... 2020-05-22 11:38:59-04:00


In [2]:
#3. Sentiment Scoring Using VADER
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

# Apply sentiment scoring
df_news['sentiment'] = df_news['headline'].apply(lambda x: sia.polarity_scores(x)['compound'])

print("\nHeadline Sentiment Scores:")
print(df_news[['headline', 'sentiment']].head())


Headline Sentiment Scores:
                                            headline  sentiment
0            Stocks That Hit 52-Week Highs On Friday      0.000
1         Stocks That Hit 52-Week Highs On Wednesday      0.000
2                      71 Biggest Movers From Friday      0.000
3       46 Stocks Moving In Friday's Mid-Day Session      0.000
4  B of A Securities Maintains Neutral on Agilent...      0.296


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\eep\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
# 4. Get Historical Stock Price Data
import yfinance as yf

# Download historical stock prices
ticker = 'A'
start_date = df_news['date'].min().date()
end_date = df_news['date'].max().date()

df_stock = yf.download(ticker, start=start_date, end=end_date, progress=False)

# Reset index to make Date a column
df_stock.reset_index(inplace=True)
df_stock['Date'] = pd.to_datetime(df_stock['Date'])

print("\nStock Price Data Sample:")
print(df_stock.head())

YF.download() has changed argument auto_adjust default to True

Stock Price Data Sample:
Price        Date      Close       High        Low       Open   Volume
Ticker                     A          A          A          A        A
0      2020-05-22  82.158417  84.759096  81.404309  82.177749  5063100
1      2020-05-26  83.270203  83.908292  82.796474  83.366888  3173400
2      2020-05-27  83.318558  83.608600  81.568658  83.434576  1917600
3      2020-05-28  83.811623  84.933103  83.202536  83.985646  1908700
4      2020-05-29  85.213493  85.551871  83.598944  84.275699  2394500


In [18]:
print("News DataFrame Columns:")
print(df_news.columns)

print("\nStock DataFrame Columns:")
print(df_stock.columns)

News DataFrame Columns:
Index(['headline', 'url', 'publisher', 'date', 'stock', 'sentiment'], dtype='object')

Stock DataFrame Columns:
MultiIndex([(  'Date',  ''),
            ( 'Close', 'A'),
            (  'High', 'A'),
            (   'Low', 'A'),
            (  'Open', 'A'),
            ('Volume', 'A')],
           names=['Price', 'Ticker'])


In [19]:
df_stock.rename(columns={
    'Close_A': 'Close',
    'Open_A': 'Open',
    'High_A': 'High',
    'Low_A': 'Low',
    'Volume_A': 'Volume'
}, inplace=True)

In [20]:
df_merged = pd.merge(
    df_news[['Date', 'headline', 'sentiment']],
    df_stock[['Date', 'Close', 'Open', 'High', 'Low', 'Volume']],
    on='Date',
    how='inner'
)

KeyError: "['Date'] not in index"

In [21]:
# Convert stock Date column to timezone-aware with UTC-4
df_stock['Date'] = df_stock['Date'].dt.tz_localize('UTC-04:00')

# Now merge again
df_merged = pd.merge(
    df_news[['Date', 'headline', 'sentiment']],
    df_stock[['Date', 'Close', 'Open', 'High', 'Low', 'Volume']],
    on='Date',
    how='inner'
)

print("\nMerged DataFrame Sample:")
print(df_merged.head())

KeyError: "['Date'] not in index"

In [22]:
print("df_news['Date'] dtype:", df_news['Date'].dtype)
print("df_stock['Date'] dtype:", df_stock['Date'].dtype)

KeyError: 'Date'