In [4]:
import requests
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from datetime import datetime, timedelta
import yfinance as yf

#Phase 1: get news data via newsapi.org

# Set your API key
api_key = '71015ea355524adaa5463cf1651763b6'

# Define the date range
end_date = datetime.now()
start_date = end_date - timedelta(days=30) # free tier caps us off at 30 days, so maybe we can do a day by day analysis?

articles_list = []

for i in range(4):
    chunk_start_date = start_date + timedelta(days=i*7)
    chunk_end_date = chunk_start_date + timedelta(days=7)
    
    url = (f'https://newsapi.org/v2/everything?q=stock market&from={chunk_start_date.strftime("%Y-%m-%d")}'
           f'&to={chunk_end_date.strftime("%Y-%m-%d")}&language=en&sortBy=publishedAt&apiKey={api_key}')
    response = requests.get(url)
    
    if response.status_code == 200:
        articles = response.json().get('articles', [])
        articles_list.extend(articles)
    else:
        print(f"Error fetching data: {response.status_code}")

# Store the articles in a pandas df
df = pd.DataFrame(articles_list)
print(df[['publishedAt', 'title', 'description']])

              publishedAt                                              title  \
0    2024-06-02T23:58:41Z         The Fastest Declining Large City In The US   
1    2024-06-02T23:40:20Z  Mad Paws Holdings Limited (ASX:MPA): Are Analy...   
2    2024-06-02T23:34:07Z  ARE : Partial Correction to “Notice of the 15t...   
3    2024-06-02T23:24:33Z  Saudi Arabia’s oil giant sees massive stock of...   
4    2024-06-02T23:13:05Z  Ships Diverted From Red Sea Send Ripple Effect...   
..                    ...                                                ...   
395  2024-06-23T17:26:45Z  PepsiCo, Inc. (NASDAQ:PEP) Shares Acquired by ...   
396  2024-06-23T17:18:43Z  Triangle Securities Wealth Management Sells 48...   
397  2024-06-23T17:18:42Z  Visa Inc. (NYSE:V) Holdings Lowered by Liberty...   
398  2024-06-23T17:18:42Z  Maryland Capital Advisors Inc. Increases Posit...   
399  2024-06-23T17:18:42Z  Rockland Trust Co. Has $36.26 Million Position...   

                                       

In [5]:
# Phase 2: Pre-processing & Sentiment Analysis
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Function to get the sentiment score
def get_sentiment_score(text):
    return sid.polarity_scores(text)['compound'] if text else 0

df['sentiment'] = df['description'].apply(get_sentiment_score)

# Convert 'publishedAt' to datetime and ensure dates are parsed correctly
df['publishedAt'] = pd.to_datetime(df['publishedAt']).dt.date

# Verify the parsing
print(df['publishedAt'].head(10))

# Group by day and calculate average sentiment
df['day'] = df['publishedAt']
daily_sentiment = df.groupby('day')['sentiment'].mean().reset_index()

# Group by week and calculate average sentiment
df['week'] = pd.to_datetime(df['day']).dt.to_period('W').apply(lambda r: r.start_time)
weekly_sentiment = df.groupby('week')['sentiment'].mean().reset_index()

# Display the daily and weekly sentiment
print(daily_sentiment.head(10))
print(weekly_sentiment.head(10))

# Display the range of dates
print(f"Date range: {df['day'].min()} to {df['day'].max()}")


0    2024-06-02
1    2024-06-02
2    2024-06-02
3    2024-06-02
4    2024-06-02
5    2024-06-02
6    2024-06-02
7    2024-06-02
8    2024-06-02
9    2024-06-02
Name: publishedAt, dtype: object
          day  sentiment
0  1970-01-01   0.000000
1  2024-06-02   0.277242
2  2024-06-09   0.244690
3  2024-06-16   0.378506
4  2024-06-23   0.379557
        week  sentiment
0 1969-12-29   0.000000
1 2024-05-27   0.277242
2 2024-06-03   0.244690
3 2024-06-10   0.378506
4 2024-06-17   0.379557
Date range: 1970-01-01 to 2024-06-23


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
#fetching stock market data for daily and weekly trends

# Define the stock ticker and date range
ticker = 'SPY'  # S&P 500 ETF as an example
end_date = datetime.now().date()
start_date = end_date - timedelta(days=30)
stock_data = yf.download(ticker, start=start_date, end=end_date, interval='1d')

# Calculate daily returns
stock_data['daily_return'] = stock_data['Adj Close'].pct_change()

# Reset the index to join with sentiment data
stock_data.reset_index(inplace=True)

# Convert Date to date
stock_data['Date'] = stock_data['Date'].dt.date

# Display the stock data
print("Stock Data with Daily Returns:\n", stock_data[['Date', 'daily_return']])

# Calculate weekly returns
stock_data['week'] = pd.to_datetime(stock_data['Date']).dt.to_period('W').apply(lambda r: r.start_time)
weekly_stock_data = stock_data.groupby('week').apply(lambda x: (x['Adj Close'].iloc[-1] - x['Adj Close'].iloc[0]) / x['Adj Close'].iloc[0]).reset_index()
weekly_stock_data.columns = ['week', 'weekly_return']

# Display the weekly stock data
print("Stock Data with Weekly Returns:\n", weekly_stock_data)

[*********************100%%**********************]  1 of 1 completed

Stock Data with Daily Returns:
           Date  daily_return
0   2024-05-28           NaN
1   2024-05-29     -0.007002
2   2024-05-30     -0.006634
3   2024-05-31      0.009108
4   2024-06-03      0.000815
5   2024-06-04      0.001118
6   2024-06-05      0.011885
7   2024-06-06     -0.000019
8   2024-06-07     -0.001216
9   2024-06-10      0.003090
10  2024-06-11      0.002408
11  2024-06-12      0.008213
12  2024-06-13      0.002013
13  2024-06-14      0.000608
14  2024-06-17      0.007959
15  2024-06-18      0.002541
16  2024-06-20     -0.002717
17  2024-06-21     -0.001341
18  2024-06-24     -0.003251
Stock Data with Weekly Returns:
         week  weekly_return
0 2024-05-27      -0.004605
1 2024-06-03       0.011766
2 2024-06-10       0.013292
3 2024-06-17      -0.001523
4 2024-06-24       0.000000





In [None]:
print(weekly_stock_data.head(10))
print(daily_sentiment.head(10))

In [11]:
# Verify the date ranges in daily_sentiment again after filtering
print("Filtered Daily Sentiment Dates:\n", daily_sentiment['day'].unique())

# Verify the date ranges in stock_data
print("Stock Data Dates:\n", stock_data['Date'].unique())

# Merging daily sentiment and stock data after filtering invalid dates
daily_combined_data = pd.merge(daily_sentiment, stock_data[['Date', 'daily_return']], left_on='day', right_on='Date', how='inner')

# Display the daily combined data
print("Daily Combined Data:\n", daily_combined_data.head(10))

# Handle missing data
daily_combined_data.dropna(subset=[# Verify the date ranges in daily_sentiment again after filtering
print("Filtered Daily Sentiment Dates:\n", daily_sentiment['day'].unique())

# Verify the date ranges in stock_data
print("Stock Data Dates:\n", stock_data['Date'].unique())

# Merging daily sentiment and stock data after filtering invalid dates
daily_combined_data = pd.merge(daily_sentiment, stock_data[['Date', 'daily_return']], left_on='day', right_on='Date', how='inner')

# Display the daily combined data
print("Daily Combined Data:\n", daily_combined_data.head(10))

# Handle missing data
daily_combined_data.dropna(subset=['daily_return'], inplace=True)

# Display the daily combined data after dropping NaNs
print("Daily Combined Data After Dropping NaNs:\n", daily_combined_data[['day', 'sentiment', 'daily_return']].head(10))

# Calculate daily correlation if there are enough data points
if len(daily_combined_data) > 1:
    daily_correlation = daily_combined_data['sentiment'].corr(daily_combined_data['daily_return'])
    print(f'Correlation between daily sentiment and daily stock performance: {daily_correlation}')
else:
    print("Not enough data points to calculate daily correlation.")
'daily_return'], inplace=True)

# Display the daily combined data after dropping NaNs
print("Daily Combined Data After Dropping NaNs:\n", daily_combined_data[['day', 'sentiment', 'daily_return']].head(10))

# Calculate daily correlation if there are enough data points
if len(daily_combined_data) > 1:
    daily_correlation = daily_combined_data['sentiment'].corr(daily_combined_data['daily_return'])
    print(f'Correlation between daily sentiment and daily stock performance: {daily_correlation}')
else:
    print("Not enough data points to calculate daily correlation.")


Filtered Daily Sentiment Dates:
 [datetime.date(2024, 6, 2) datetime.date(2024, 6, 9)
 datetime.date(2024, 6, 16) datetime.date(2024, 6, 23)]
Stock Data Dates:
 [datetime.date(2024, 5, 28) datetime.date(2024, 5, 29)
 datetime.date(2024, 5, 30) datetime.date(2024, 5, 31)
 datetime.date(2024, 6, 3) datetime.date(2024, 6, 4)
 datetime.date(2024, 6, 5) datetime.date(2024, 6, 6)
 datetime.date(2024, 6, 7) datetime.date(2024, 6, 10)
 datetime.date(2024, 6, 11) datetime.date(2024, 6, 12)
 datetime.date(2024, 6, 13) datetime.date(2024, 6, 14)
 datetime.date(2024, 6, 17) datetime.date(2024, 6, 18)
 datetime.date(2024, 6, 20) datetime.date(2024, 6, 21)
 datetime.date(2024, 6, 24)]
Daily Combined Data:
 Empty DataFrame
Columns: [day, sentiment, Date, daily_return]
Index: []
Daily Combined Data After Dropping NaNs:
 Empty DataFrame
Columns: [day, sentiment, daily_return]
Index: []
Not enough data points to calculate daily correlation.
