In [1]:
import requests
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from datetime import datetime, timedelta
import yfinance as yf

#Phase 1: get news data via newsapi.org

# Set your API key
api_key = '71015ea355524adaa5463cf1651763b6'

# Define the date range
end_date = datetime.now()
start_date = end_date - timedelta(days=30) # free tier caps us off at 30 days, so maybe we can do a day by day analysis?

# Fetch news articles
url = (f'https://newsapi.org/v2/everything?q=stock market&from={start_date.strftime("%Y-%m-%d")}'
       f'&to={end_date.strftime("%Y-%m-%d")}&language=en&sortBy=publishedAt&apiKey={api_key}')
response = requests.get(url)
# print(response.json())
articles = response.json()['articles']

#Store the articles in a pandas df (could use spark in future for improved performance/scalability)
df = pd.DataFrame(articles)
print(df[['publishedAt', 'title', 'description']])

             publishedAt                                              title  \
0   2024-06-25T23:59:50Z  $100 billion m-cap ICICI Bank 6th Indian compa...   
1   2024-06-25T23:57:03Z  2 ASX shares that would pass Peter Lynch's fav...   
2   2024-06-25T23:44:00Z  Hecla Mining VP sells $71,411 in stock, acquir...   
3   2024-06-25T23:41:07Z  Will Ford Pay Another Special Dividend in 2025...   
4   2024-06-25T23:35:54Z  Quest 2 Is Now Out Of Stock In The US - Is The...   
..                   ...                                                ...   
95  2024-06-25T21:23:56Z  Saba Capital Management buys Destra Multi-Alte...   
96  2024-06-25T21:23:00Z  Twelve Seas Investment Company II Announces Te...   
97  2024-06-25T21:16:42Z  Crane Advisory LLC Increases Stock Position in...   
98  2024-06-25T21:15:21Z  Apple shares rise Tuesday, but still underperf...   
99  2024-06-25T21:14:31Z                                       News 6/26/24   

                                          descripti

In [2]:
#Phase 2: Pre-processing & Sentiment Analysis
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Function to get the sentiment score
def get_sentiment_score(text):
    return sid.polarity_scores(text)['compound'] if text else 0
df['sentiment'] = df['description'].apply(get_sentiment_score)
# df['sentiment'] = df['description'].apply(lambda x: sid.polarity_scores(x)['compound'] if x else 0)

#Converting publishedAt to datetime
df['publishedAt'] = pd.to_datetime(df['publishedAt'])

#Grouping by day and calculating average sentiment
df['day'] = df['publishedAt'].dt.date
daily_sentiment = df.groupby('day')['sentiment'].mean().reset_index()

#Grouping by week and calculating average sentiment
df['week'] = df['publishedAt'].dt.to_period('W').apply(lambda r: r.start_time)
weekly_sentiment = df.groupby('week')['sentiment'].mean().reset_index()

print(daily_sentiment)
print(weekly_sentiment)

          day  sentiment
0  2024-06-25   0.264103
        week  sentiment
0 2024-06-24   0.264103


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  df['week'] = df['publishedAt'].dt.to_period('W').apply(lambda r: r.start_time)


In [3]:
#fetching stock market data for daily and weekly trends

# Define the stock ticker and date range
ticker = 'SPY'  # S&P 500 ETF as an example
end_date = datetime.now()
start_date = end_date - timedelta(days=30)
stock_data = yf.download(ticker, start=start_date, end=end_date, interval='1d')

# Calculate daily returns
stock_data['daily_return'] = stock_data['Adj Close'].pct_change()

# Reset the index to join with sentiment data
stock_data.reset_index(inplace=True)

# Convert Date to date
stock_data['Date'] = stock_data['Date'].dt.date

# Display the stock data
print("Stock Data with Daily Returns:\n", stock_data[['Date', 'daily_return']])

# Calculate weekly returns
stock_data['week'] = pd.to_datetime(stock_data['Date']).dt.to_period('W').apply(lambda r: r.start_time)
weekly_stock_data = stock_data.groupby('week').apply(lambda x: (x['Adj Close'].iloc[-1] - x['Adj Close'].iloc[0]) / x['Adj Close'].iloc[0]).reset_index()
weekly_stock_data.columns = ['week', 'weekly_return']

# Display the weekly stock data
print("Stock Data with Weekly Returns:\n", weekly_stock_data)

[*********************100%%**********************]  1 of 1 completed

Stock Data with Daily Returns:
           Date  daily_return
0   2024-05-28           NaN
1   2024-05-29     -0.007002
2   2024-05-30     -0.006634
3   2024-05-31      0.009108
4   2024-06-03      0.000815
5   2024-06-04      0.001118
6   2024-06-05      0.011885
7   2024-06-06     -0.000019
8   2024-06-07     -0.001216
9   2024-06-10      0.003090
10  2024-06-11      0.002408
11  2024-06-12      0.008213
12  2024-06-13      0.002013
13  2024-06-14      0.000608
14  2024-06-17      0.007959
15  2024-06-18      0.002541
16  2024-06-20     -0.002717
17  2024-06-21     -0.001341
18  2024-06-24     -0.003251
19  2024-06-25      0.003851
Stock Data with Weekly Returns:
         week  weekly_return
0 2024-05-27      -0.004605
1 2024-06-03       0.011766
2 2024-06-10       0.013292
3 2024-06-17      -0.001523
4 2024-06-24       0.003851





In [10]:
# Joining the sentiment data with stock performance for daily and weekly trends and correlating them

# Merge daily sentiment and stock data
daily_combined_data = pd.merge(daily_sentiment, stock_data[['Date', 'daily_return']], left_on='day', right_on='Date', how='inner')

print(daily_combined_data.head(10))

# # Display the daily combined data
# print("Daily Combined Data:\n", daily_combined_data[['day', 'sentiment', 'daily_return']])

# # Calculate daily correlation
# daily_correlation = daily_combined_data['sentiment'].corr(daily_combined_data['daily_return'])
# print(f'Correlation between daily sentiment and daily stock performance: {daily_correlation}')


          day  sentiment        Date  daily_return
0  2024-06-25   0.264103  2024-06-25      0.003851
