# Correlation Between News Sentiment and Stock Movements
This notebook demonstrates how to analyze the correlation between news headline sentiment and stock price movements using modular functions.

In [1]:
# Import required libraries and custom modules
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath(".."))
from src.news_stock_correlation import (
    normalize_dates, compute_sentiment, aggregate_daily_sentiment,
    compute_daily_returns, merge_sentiment_returns, compute_correlation
)  

## 1. Load News and Stock Data
Place your news data (CSV) and stock price data (CSV) in the `data/` directory. Adjust file paths as needed.

In [2]:
# Example file paths (update as needed)
news_path = '../data/raw_analyst_ratings.csv'
stock_path = '../data/yfinance_data/AAPL_historical_data.csv'

# Load data
news_df = pd.read_csv(news_path)
stock_df = pd.read_csv(stock_path)

# Display first few rows
news_df.head(), stock_df.head()

(   Unnamed: 0                                           headline  \
 0           0            Stocks That Hit 52-Week Highs On Friday   
 1           1         Stocks That Hit 52-Week Highs On Wednesday   
 2           2                      71 Biggest Movers From Friday   
 3           3       46 Stocks Moving In Friday's Mid-Day Session   
 4           4  B of A Securities Maintains Neutral on Agilent...   
 
                                                  url          publisher  \
 0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
 1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
 2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   
 3  https://www.benzinga.com/news/20/05/16095921/4...         Lisa Levin   
 4  https://www.benzinga.com/news/20/05/16095304/b...         Vick Meyer   
 
                         date stock  
 0  2020-06-05 10:30:54-04:00     A  
 1  2020-06-03 10:45:20-04:00     A  
 2  2020-05-

## 2. Normalize Dates
Align date columns in both datasets for accurate merging.

In [3]:
# Normalize date columns
news_df = normalize_dates(news_df, 'date')
stock_df = normalize_dates(stock_df, 'Date')

## 3. Sentiment Analysis on News Headlines
Assign sentiment polarity scores to each news headline.

In [4]:
# Compute sentiment scores
news_df = compute_sentiment(news_df, text_col='headline')
news_df[["date", "headline", "sentiment"]].head()

Unnamed: 0,date,headline,sentiment
0,2020-06-05 10:30:54,Stocks That Hit 52-Week Highs On Friday,0.0
1,2020-06-03 10:45:20,Stocks That Hit 52-Week Highs On Wednesday,0.0
2,2020-05-26 04:30:07,71 Biggest Movers From Friday,0.0
3,2020-05-22 12:45:06,46 Stocks Moving In Friday's Mid-Day Session,0.0
4,2020-05-22 11:38:59,B of A Securities Maintains Neutral on Agilent...,0.0


## 4. Plot 1: Sentiment Score Histogram
Visualize the distribution of sentiment scores across all news headlines.

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(7,4))
news_df['sentiment'].hist(bins=30, color='skyblue', edgecolor='black')
plt.title('Distribution of Sentiment Scores')
plt.xlabel('Sentiment Polarity')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

## 5. Aggregate Daily Sentiment
Calculate the average sentiment score for each day.

In [5]:
# Aggregate daily sentiment
sentiment_daily = aggregate_daily_sentiment(news_df, date_col='date')
sentiment_daily.head()

Unnamed: 0,date,avg_sentiment
0,2011-04-27 21:01:48,0.0
1,2011-04-28 13:49:29,0.136364
2,2011-04-28 15:00:36,0.0
3,2011-04-29 13:47:06,-0.166667
4,2011-04-29 16:11:05,0.5


## 6. Calculate Daily Stock Returns
Compute daily percentage changes in closing prices.

In [6]:
# Compute daily returns
returns_daily = compute_daily_returns(stock_df, date_col='Date', price_col='Close')
returns_daily.head()

Unnamed: 0,Date,daily_return
0,1980-12-12,
1,1980-12-15,-0.052171
2,1980-12-16,-0.073398
3,1980-12-17,0.024751
4,1980-12-18,0.028992


## 7. Merge Sentiment and Returns by Date
Combine the two datasets for correlation analysis.

In [7]:
# Ensure both DataFrames have the same date column name before merging
sentiment_daily = sentiment_daily.rename(columns={"date": "date", "Date": "date"})
returns_daily = returns_daily.rename(columns={"date": "date", "Date": "date"})

# Merge on date
merged = merge_sentiment_returns(sentiment_daily, returns_daily, date_col='date')
merged.head()

Unnamed: 0,date,avg_sentiment,daily_return


## 8. Correlation Analysis
Calculate the Pearson correlation coefficient between average daily sentiment and stock returns.

In [8]:
# Compute correlation
corr, pval = compute_correlation(merged)
if corr is not None and pval is not None:
    print(f"Pearson correlation: {corr:.3f} (p-value: {pval:.3g})")
else:
    print("Not enough data to compute correlation (need at least 2 valid data points).")

Not enough data to compute correlation (need at least 2 valid data points).


## 9. Visualize the Relationship
(Optional) Plot the relationship between daily sentiment and stock returns.

In [None]:
import matplotlib.pyplot as plt

if 'merged' in locals():
    plt.figure(figsize=(8,5))
    plt.scatter(merged['avg_sentiment'], merged['daily_return'], alpha=0.6)
    plt.xlabel('Average Daily Sentiment')
    plt.ylabel('Daily Stock Return')
    plt.title('Sentiment vs. Stock Return')
    plt.grid(True)
    plt.show()
else:
    print("The variable 'merged' is not defined. Please run the previous cells to create it.")

## Additional Analysis
### Plot 2: Stock Price Time Series
Visualize the closing price of the stock over time.

In [None]:
# Plot 2: Stock Price Time Series
plt.figure(figsize=(10,4))
plt.plot(stock_df['Date'], stock_df['Close'], label='Close Price')
plt.title('Stock Closing Price Over Time')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.tight_layout()
plt.show()

### Plot 3: News Volume by Date
Visualize the number of news headlines published each day.

In [None]:
# Plot 3: News Volume by Date
news_volume = news_df.copy()
news_volume['date'] = pd.to_datetime(news_volume['date'], errors='coerce')
daily_counts = news_volume['date'].dt.date.value_counts().sort_index()
plt.figure(figsize=(10,4))
daily_counts.plot(kind='bar')
plt.title('Number of News Headlines per Day')
plt.xlabel('Date')
plt.ylabel('Headline Count')
plt.tight_layout()
plt.show()

### Plot 4: Sentiment Time Series
Visualize the average daily sentiment over time, with an optional rolling average.

In [None]:
# Plot 4: Sentiment Time Series (rolling average)
import matplotlib.pyplot as plt
sentiment_daily_sorted = sentiment_daily.sort_values('date')
plt.figure(figsize=(10,4))
plt.plot(sentiment_daily_sorted['date'], sentiment_daily_sorted['avg_sentiment'], label='Daily Avg Sentiment', alpha=0.5)
plt.plot(sentiment_daily_sorted['date'], sentiment_daily_sorted['avg_sentiment'].rolling(window=7, min_periods=1).mean(), label='7-day Rolling Avg', color='orange')
plt.title('Sentiment Time Series (with 7-day Rolling Average)')
plt.xlabel('Date')
plt.ylabel('Average Sentiment')
plt.legend()
plt.tight_layout()
plt.show()

### Table 1: Descriptive Statistics of Returns

In [None]:
# Table 1: Descriptive Statistics of Returns
returns_stats = returns_daily['daily_return'].describe().to_frame()
returns_stats.loc['skew'] = returns_daily['daily_return'].skew()
returns_stats.loc['kurtosis'] = returns_daily['daily_return'].kurtosis()
display(returns_stats)

### Table 2: Correlation Results Table

In [None]:
# Table 2: Correlation Results Table (single stock template)
import pandas as pd
corr_table = pd.DataFrame({
    'Stock': ['AAPL'],
    'Correlation': [corr],
    'p-value': [pval]
})
display(corr_table)

---
### References
- [TextBlob Documentation](https://textblob.readthedocs.io/en/dev/)
- [Pearson Correlation (scipy)](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html)