# Statistical Analysis

## Relevant Imports

In [89]:
import pandas as pd
import numpy as np
# Plot histograms for sentiment_score and Close price
import matplotlib.pyplot as plt
import statsmodels.tsa.stattools as ts
from statsmodels.tsa.stattools import grangercausalitytests
from collections import Counter
import re
from scipy.stats import pearsonr
from collections import Counter
import nltk
from nltk.corpus import stopwords
import gensim
from gensim import corpora

## Preprocessing

In [90]:
filepath_numerical = '../../dataset_final/FinSen_S&P500/S&P500_numerical.csv'
filepath_sentiment = '../../dataset_final/FinSen_S&P500/FinSen_text_annotated.csv'

In [91]:
data_num = pd.read_csv(filepath_numerical)
data_num.head()

Unnamed: 0,Date,Open,Close,High,Volume
0,2023-07-14,4514.6,4505.41,4527.75,3647450000.0
1,2023-07-13,4491.5,4510.04,4517.37,3839530000.0
2,2023-07-12,4467.68,4472.16,4488.33,3920290000.0
3,2023-07-11,4415.54,4439.25,4443.64,3624220000.0
4,2023-07-10,4394.22,4409.52,4412.6,3429600000.0


In [92]:
data_sent = pd.read_csv(filepath_sentiment)
data_sent.head()

Unnamed: 0.1,Unnamed: 0,Title,Tag,Time,Content,sentiment_score
0,0,"TSX Slightly Down, Books Weekly Gains",Stock Market,16/07/2023,"TSX Slightly Down, Books Weekly GainsUnited St...",0.13
1,1,UnitedHealth Hits 4-week High,stocks,15/07/2023,UnitedHealth Hits 4-week HighUnited States sto...,0.62
2,2,Cisco Systems Hits 4-week Low,stocks,15/07/2023,Cisco Systems Hits 4-week LowUnited States sto...,-0.43
3,3,AT&T Hits All-time Low,stocks,15/07/2023,AT&T Hits All-time LowUnited States stocksAT&T...,-0.28
4,4,Microsoft Hits 4-week High,stocks,15/07/2023,Microsoft Hits 4-week HighUnited States stocks...,0.6


In [93]:
if 'Unnamed: 0' in data_sent.columns:
    data_sent = data_sent.drop(columns=['Unnamed: 0'])
data_sent.rename(columns={'Time': 'Date'}, inplace=True)
data_sent.head()

Unnamed: 0,Title,Tag,Date,Content,sentiment_score
0,"TSX Slightly Down, Books Weekly Gains",Stock Market,16/07/2023,"TSX Slightly Down, Books Weekly GainsUnited St...",0.13
1,UnitedHealth Hits 4-week High,stocks,15/07/2023,UnitedHealth Hits 4-week HighUnited States sto...,0.62
2,Cisco Systems Hits 4-week Low,stocks,15/07/2023,Cisco Systems Hits 4-week LowUnited States sto...,-0.43
3,AT&T Hits All-time Low,stocks,15/07/2023,AT&T Hits All-time LowUnited States stocksAT&T...,-0.28
4,Microsoft Hits 4-week High,stocks,15/07/2023,Microsoft Hits 4-week HighUnited States stocks...,0.6


In [94]:
data_sent.value_counts

<bound method DataFrame.value_counts of                                                    Title               Tag  \
0                  TSX Slightly Down, Books Weekly Gains      Stock Market   
1                          UnitedHealth Hits 4-week High            stocks   
2                          Cisco Systems Hits 4-week Low            stocks   
3                                 AT&T Hits All-time Low            stocks   
4                             Microsoft Hits 4-week High            stocks   
...                                                  ...               ...   
15529  United States GDP Rises 0.6 percent in the fir...   GDP Growth Rate   
15530  Consumer Price Index 2.6 percent higher than i...    Inflation Rate   
15531  U.S. Federal Reserve Kept Rates Unchanged at 5...     Interest Rate   
15532              Trade Deficit Increases in March 2007  Balance of Trade   
15533  Blackstone boosts IPO after Beijing takes $3bn...              News   

             Date      

In [95]:
data_sent = data_sent.drop_duplicates(subset=['Title', 'Content'])
data_sent.value_counts

<bound method DataFrame.value_counts of                                                    Title               Tag  \
0                  TSX Slightly Down, Books Weekly Gains      Stock Market   
1                          UnitedHealth Hits 4-week High            stocks   
2                          Cisco Systems Hits 4-week Low            stocks   
3                                 AT&T Hits All-time Low            stocks   
4                             Microsoft Hits 4-week High            stocks   
...                                                  ...               ...   
15529  United States GDP Rises 0.6 percent in the fir...   GDP Growth Rate   
15530  Consumer Price Index 2.6 percent higher than i...    Inflation Rate   
15531  U.S. Federal Reserve Kept Rates Unchanged at 5...     Interest Rate   
15532              Trade Deficit Increases in March 2007  Balance of Trade   
15533  Blackstone boosts IPO after Beijing takes $3bn...              News   

             Date      

## Merge Dataframes

In [96]:
# Aggregate sentiment scores by summing them up for each date
data_sent_aggregated = (
    data_sent.groupby('Date', as_index=False)
    .agg({'sentiment_score': 'sum'})
)

# Round the sentiment_score to two decimal places
data_sent_aggregated['sentiment_score'] = data_sent_aggregated['sentiment_score'].round(2)

# Merge aggregated scores back into the original DataFrame
data_sent = data_sent.drop(columns=['sentiment_score'])  # Drop original sentiment_score column
data_sent = data_sent.merge(data_sent_aggregated, on='Date', how='left')  # Add aggregated column

print(data_sent)


                                                   Title               Tag  \
0                  TSX Slightly Down, Books Weekly Gains      Stock Market   
1                          UnitedHealth Hits 4-week High            stocks   
2                          Cisco Systems Hits 4-week Low            stocks   
3                                 AT&T Hits All-time Low            stocks   
4                             Microsoft Hits 4-week High            stocks   
...                                                  ...               ...   
15478  United States GDP Rises 0.6 percent in the fir...   GDP Growth Rate   
15479  Consumer Price Index 2.6 percent higher than i...    Inflation Rate   
15480  U.S. Federal Reserve Kept Rates Unchanged at 5...     Interest Rate   
15481              Trade Deficit Increases in March 2007  Balance of Trade   
15482  Blackstone boosts IPO after Beijing takes $3bn...              News   

             Date                                            Co

In [97]:
# Ensure Date columns are in datetime format
data_num['Date'] = pd.to_datetime(data_num['Date'])
data_sent['Date'] = pd.to_datetime(data_sent['Date'])

# Merge the dataframes on the 'Date' column
data_merged = pd.merge(data_num, data_sent, on='Date', how='inner')
# After this step, df_merged will have:
# Date, Title, Content, Tag, sentiment_score, Open, High, Close, Volume, etc.


  data_sent['Date'] = pd.to_datetime(data_sent['Date'])


In [98]:
data_merged.head()

Unnamed: 0,Date,Open,Close,High,Volume,Title,Tag,Content,sentiment_score
0,2023-07-14,4514.6,4505.41,4527.75,3647450000.0,US Budget Deficit Widens More than Expected in...,Government Budget Value,US Budget Deficit Widens More than Expected in...,1.02
1,2023-07-14,4514.6,4505.41,4527.75,3647450000.0,Visa Hits 24-week High,stocks,Visa Hits 24-week HighUnited States stocksVisa...,1.02
2,2023-07-14,4514.6,4505.41,4527.75,3647450000.0,Amazon Hits 43-week High,stocks,Amazon Hits 43-week HighUnited States stocksAm...,1.02
3,2023-07-14,4514.6,4505.41,4527.75,3647450000.0,10-Year Treasury Yield Falls for 4th Session,Government Bond 10Y,10-Year Treasury Yield Falls for 4th SessionUn...,1.02
4,2023-07-14,4514.6,4505.41,4527.75,3647450000.0,DXY Approaches 100,Currency,DXY Approaches 100United States CurrencyThe do...,1.02


In [99]:
# Ensure filtered_data is a full copy of data_merged
filtered_data = data_merged.copy()

# Merge all rows for the same date into one row
merged_data = filtered_data.groupby('Date', as_index=False).agg({
    'Open': 'first',
    'Close': 'first',
    'High': 'first',
    'Volume': 'sum',  # Summing volumes for the same date
    'Title': lambda x: ', '.join(x),  # Joining titles with commas
    'Tag': lambda x: ', '.join(x),  # Joining tags with commas
    'Content': lambda x: ', '.join(x),  # Joining content with commas
    'sentiment_score': 'first'  # Assuming sentiment score is already aggregated
})

# Display or save the result
merged_data.head(10)


Unnamed: 0,Date,Open,Close,High,Volume,Title,Tag,Content,sentiment_score
0,2007-06-04,1536.28,1539.18,1540.53,5477860000.0,"Trade Deficit Increases in March 2007, Blackst...","Balance of Trade, News",Trade Deficit Increases in March 2007United St...,0.03
1,2007-06-06,1530.56,1517.38,1530.56,14820950000.0,"Nonfarm payroll employment increased by 157,00...","Unemployment Rate, Balance of Trade, GDP Growt...","Nonfarm payroll employment increased by 157,00...",0.01
2,2007-06-21,1512.5,1522.18,1522.9,3161110000.0,US Economy is expanding after Q1 slowdown,GDP Growth Rate,US Economy is expanding after Q1 slowdownUnite...,0.03
3,2007-06-26,1497.68,1492.89,1506.11,3398530000.0,Consumer Confidence and Housing Weaken in US,GDP Growth Rate,Consumer Confidence and Housing Weaken in USUn...,-0.66
4,2007-06-28,1506.31,1505.7,1514.83,3006710000.0,Fed keeps the federal funds rate at 5-1/4 percent,Interest Rate,Fed keeps the federal funds rate at 5-1/4 perc...,0.16
5,2007-07-05,1524.85,1525.4,1526.56,2622950000.0,U.S. ISM Services Index Reached 14-Month High ...,News,U.S. ISM Services Index Reached 14-Month High ...,0.66
6,2007-07-06,1524.95,1530.43,1532.4,2441520000.0,US June unemployment rate unchanged at 4.5%,Unemployment Rate,US June unemployment rate unchanged at 4.5%Uni...,-0.02
7,2007-07-10,1531.84,1510.11,1531.84,3244280000.0,"U.S. Growth, Easing Inflation to Follow Fed `S...",Inflation Rate,"U.S. Growth, Easing Inflation to Follow Fed `S...",0.0
8,2007-07-12,1518.73,1547.69,1547.92,3489600000.0,US trade deficit increases to $60.0 billion in...,Balance of Trade,US trade deficit increases to $60.0 billion in...,-0.02
9,2007-07-18,1549.19,1546.17,1549.19,3609220000.0,Fed chief acknowledges credit fears,Stock Market,Fed chief acknowledges credit fearsUnited Stat...,-0.11


## Movement calculation - Open (t+1) - Close (t)

In [100]:
merged_data = merged_data.sort_index()

# Calculate next-day return
merged_data['Movement'] = ((merged_data['Open'].shift(-1) - merged_data['Close']) / merged_data['Close'])

# Drop the last row since it doesn't have a future value
merged_data = merged_data.dropna(subset=['Movement'])

In [101]:
merged_data.head()

Unnamed: 0,Date,Open,Close,High,Volume,Title,Tag,Content,sentiment_score,Movement
0,2007-06-04,1536.28,1539.18,1540.53,5477860000.0,"Trade Deficit Increases in March 2007, Blackst...","Balance of Trade, News",Trade Deficit Increases in March 2007United St...,0.03,-0.0056
1,2007-06-06,1530.56,1517.38,1530.56,14820950000.0,"Nonfarm payroll employment increased by 157,00...","Unemployment Rate, Balance of Trade, GDP Growt...","Nonfarm payroll employment increased by 157,00...",0.01,-0.003216
2,2007-06-21,1512.5,1522.18,1522.9,3161110000.0,US Economy is expanding after Q1 slowdown,GDP Growth Rate,US Economy is expanding after Q1 slowdownUnite...,0.03,-0.016095
3,2007-06-26,1497.68,1492.89,1506.11,3398530000.0,Consumer Confidence and Housing Weaken in US,GDP Growth Rate,Consumer Confidence and Housing Weaken in USUn...,-0.66,0.008989
4,2007-06-28,1506.31,1505.7,1514.83,3006710000.0,Fed keeps the federal funds rate at 5-1/4 percent,Interest Rate,Fed keeps the federal funds rate at 5-1/4 perc...,0.16,0.012718


In [102]:
# Assuming df already has Next_Day_Return and is cleaned
# List of features you want to analyze
feature_cols = ['Open', 'Close', 'High', 'Volume', 'sentiment_score']

# Calculate the correlation matrix for all features including Next_Day_Return
corr_matrix = merged_data[feature_cols + ['Movement']].corr()

# Print the correlation of each feature with Next_Day_Return
print("Correlation with Next_Day_Return:")
for col in feature_cols:
    print(f"{col}: {corr_matrix.loc[col, 'Movement']:.4f}")


Correlation with Next_Day_Return:
Open: 0.0129
Close: 0.0110
High: 0.0118
Volume: -0.0119
sentiment_score: -0.0094


## Feature Engineering

### 1. Lagged Features

In [103]:
# Create lagged features (e.g., 1-day lag)
merged_data['Open_lag1'] = merged_data['Open'].shift(1)
merged_data['Close_lag1'] = merged_data['Close'].shift(1)
merged_data['High_lag1'] = merged_data['High'].shift(1)
merged_data['Volume_lag1'] = merged_data['Volume'].shift(1)
merged_data['Sentiment_lag1'] = merged_data['sentiment_score'].shift(1)

# Drop rows introduced by shifting at the start of dataset
merged_data = merged_data.dropna(subset=['Open_lag1', 'Close_lag1', 'High_lag1', 'Volume_lag1', 'Sentiment_lag1', 'Movement'])


In [104]:
# Create lagged features (e.g., 2-day lag)
merged_data['Open_lag2'] = merged_data['Open'].shift(2)
merged_data['Close_lag2'] = merged_data['Close'].shift(2)
merged_data['High_lag2'] = merged_data['High'].shift(2)
merged_data['Volume_lag2'] = merged_data['Volume'].shift(2)
merged_data['Sentiment_lag2'] = merged_data['sentiment_score'].shift(2)

# Drop rows introduced by shifting at the start of dataset
merged_data = merged_data.dropna(subset=['Open_lag2', 'Close_lag2', 'High_lag2', 'Volume_lag2', 'Sentiment_lag2', 'Movement'])


In [105]:
# Create lagged features (e.g., 3-day lag)
merged_data['Open_lag3'] = merged_data['Open'].shift(3)
merged_data['Close_lag3'] = merged_data['Close'].shift(3)
merged_data['High_lag3'] = merged_data['High'].shift(3)
merged_data['Volume_lag3'] = merged_data['Volume'].shift(3)
merged_data['Sentiment_lag3'] = merged_data['sentiment_score'].shift(3)

# Drop rows introduced by shifting at the start of dataset
merged_data = merged_data.dropna(subset=['Open_lag3', 'Close_lag3', 'High_lag3', 'Volume_lag3', 'Sentiment_lag3', 'Movement'])


In [106]:
# Update feature list with newly created features
new_features = ['Open_lag1', 'Close_lag1', 'High_lag1', 'Volume_lag1', 'Sentiment_lag1','Open_lag2', 'Close_lag2', 'High_lag2', 'Volume_lag2', 'Sentiment_lag2','Open_lag3', 'Close_lag3', 'High_lag3', 'Volume_lag3', 'Sentiment_lag3']

# Compute correlation matrix again
corr_matrix = merged_data[new_features + ['Movement']].corr()

print("Correlation with Movement:")
for f in new_features:
    print(f"{f}: {corr_matrix.loc[f, 'Movement']:.4f}")


Correlation with Movement:
Open_lag1: 0.0116
Close_lag1: 0.0129
High_lag1: 0.0117
Volume_lag1: 0.0079
Sentiment_lag1: -0.0026
Open_lag2: 0.0112
Close_lag2: 0.0110
High_lag2: 0.0109
Volume_lag2: -0.0052
Sentiment_lag2: 0.0030
Open_lag3: 0.0120
Close_lag3: 0.0115
High_lag3: 0.0117
Volume_lag3: 0.0210
Sentiment_lag3: -0.0113


### 2. Rolling Averages

In [107]:
# 5-day windows
merged_data['Open_5d_SMA'] = merged_data['Open'].rolling(window=5).mean()
merged_data['Close_5d_SMA'] = merged_data['Close'].rolling(window=5).mean()
merged_data['High_5d_SMA'] = merged_data['High'].rolling(window=5).mean()
merged_data['Volume_5d_SMA'] = merged_data['Volume'].rolling(window=5).mean()
merged_data['Sentiment_5d_SMA'] = merged_data['sentiment_score'].rolling(window=5).mean()

# 7-day windows
merged_data['Open_7d_SMA'] = merged_data['Open'].rolling(window=7).mean()
merged_data['Close_7d_SMA'] = merged_data['Close'].rolling(window=7).mean()
merged_data['High_7d_SMA'] = merged_data['High'].rolling(window=7).mean()
merged_data['Volume_7d_SMA'] = merged_data['Volume'].rolling(window=7).mean()
merged_data['Sentiment_7d_SMA'] = merged_data['sentiment_score'].rolling(window=7).mean()

# 7-day windows
merged_data['Open_14d_SMA'] = merged_data['Open'].rolling(window=14).mean()
merged_data['Close_14d_SMA'] = merged_data['Close'].rolling(window=14).mean()
merged_data['High_14d_SMA'] = merged_data['High'].rolling(window=14).mean()
merged_data['Volume_14d_SMA'] = merged_data['Volume'].rolling(window=14).mean()
merged_data['Sentiment_14d_SMA'] = merged_data['sentiment_score'].rolling(window=14).mean()


# Drop rows with NaN due to rolling at dataset start
merged_data = merged_data.dropna(subset=['Open_5d_SMA', 'Close_5d_SMA', 'High_5d_SMA', 'Volume_5d_SMA', 'Sentiment_5d_SMA', 'Open_7d_SMA', 'Close_7d_SMA', 'High_7d_SMA', 'Volume_7d_SMA', 'Sentiment_7d_SMA', 'Open_14d_SMA', 'Close_14d_SMA', 'High_14d_SMA', 'Volume_14d_SMA', 'Sentiment_14d_SMA', 'Movement'])


In [108]:
# Update feature list with newly created features
new_features = ['Open_5d_SMA', 'Close_5d_SMA', 'High_5d_SMA', 'Volume_5d_SMA', 'Sentiment_5d_SMA', 'Open_7d_SMA', 'Close_7d_SMA', 'High_7d_SMA', 'Volume_7d_SMA', 'Sentiment_7d_SMA', 'Open_14d_SMA', 'Close_14d_SMA', 'High_14d_SMA', 'Volume_14d_SMA', 'Sentiment_14d_SMA']

# Compute correlation matrix again
corr_matrix = merged_data[new_features + ['Movement']].corr()

print("Correlation with Movement:")
for f in new_features:
    print(f"{f}: {corr_matrix.loc[f, 'Movement']:.4f}")

Correlation with Movement:
Open_5d_SMA: 0.0109
Close_5d_SMA: 0.0108
High_5d_SMA: 0.0106
Volume_5d_SMA: 0.0059
Sentiment_5d_SMA: -0.0014
Open_7d_SMA: 0.0111
Close_7d_SMA: 0.0110
High_7d_SMA: 0.0108
Volume_7d_SMA: 0.0029
Sentiment_7d_SMA: -0.0006
Open_14d_SMA: 0.0103
Close_14d_SMA: 0.0104
High_14d_SMA: 0.0101
Volume_14d_SMA: 0.0065
Sentiment_14d_SMA: 0.0058


### 3. Rolling Volatility

In [109]:
merged_data['Open_10d_STD'] = merged_data['Open'].rolling(window=10).std()
merged_data['Close_10d_STD'] = merged_data['Close'].rolling(window=10).std()
merged_data['High_10d_STD'] = merged_data['High'].rolling(window=10).std()
merged_data['Volume_10d_STD'] = merged_data['Volume'].rolling(window=10).std()
merged_data['Sentiment_10d_STD'] = merged_data['sentiment_score'].rolling(window=10).std()

# Drop rows with NaN due to rolling
merged_data = merged_data.dropna(subset=['Open_10d_STD', 'Close_10d_STD', 'High_10d_STD', 'Volume_10d_STD', 'Sentiment_10d_STD', 'Movement'])


In [110]:
# Update feature list with newly created features
new_features = ['Open_10d_STD', 'Close_10d_STD', 'High_10d_STD', 'Volume_10d_STD', 'Sentiment_10d_STD']

# Compute correlation matrix again
corr_matrix = merged_data[new_features + ['Movement']].corr()

print("Correlation with Movement:")
for f in new_features:
    print(f"{f}: {corr_matrix.loc[f, 'Movement']:.4f}")

Correlation with Movement:
Open_10d_STD: -0.0221
Close_10d_STD: -0.0169
High_10d_STD: -0.0230
Volume_10d_STD: 0.0098
Sentiment_10d_STD: 0.0205


## Causality Testing

### 1. Stationarity Testing

In [111]:
# Let's test stationarity on the 'Movement' series
movement_series = merged_data['Movement'].dropna()

adf_result = ts.adfuller(movement_series, autolag='AIC')

print("ADF Statistic: ", adf_result[0])
print("p-value: ", adf_result[1])
for key, value in adf_result[4].items():
    print('Critical Values:')
    print(f'   {key}, {value}')


ADF Statistic:  -9.974300468170076
p-value:  2.198310053058716e-17
Critical Values:
   1%, -3.432718083748152
Critical Values:
   5%, -2.862586250752157
Critical Values:
   10%, -2.5673269523447004


In [112]:
# Let's test stationarity on the 'sentiment' series
movement_series = merged_data['sentiment_score'].dropna()

adf_result = ts.adfuller(movement_series, autolag='AIC')

print("ADF Statistic: ", adf_result[0])
print("p-value: ", adf_result[1])
for key, value in adf_result[4].items():
    print('Critical Values:')
    print(f'   {key}, {value}')


ADF Statistic:  -7.101849497439868
p-value:  4.1484705702213904e-10
Critical Values:
   1%, -3.432716369874773
Critical Values:
   5%, -2.8625854938408413
Critical Values:
   10%, -2.5673265493658404


### 2. Granger Causality Testing

In [113]:
merged_data.columns

Index(['Date', 'Open', 'Close', 'High', 'Volume', 'Title', 'Tag', 'Content',
       'sentiment_score', 'Movement', 'Open_lag1', 'Close_lag1', 'High_lag1',
       'Volume_lag1', 'Sentiment_lag1', 'Open_lag2', 'Close_lag2', 'High_lag2',
       'Volume_lag2', 'Sentiment_lag2', 'Open_lag3', 'Close_lag3', 'High_lag3',
       'Volume_lag3', 'Sentiment_lag3', 'Open_5d_SMA', 'Close_5d_SMA',
       'High_5d_SMA', 'Volume_5d_SMA', 'Sentiment_5d_SMA', 'Open_7d_SMA',
       'Close_7d_SMA', 'High_7d_SMA', 'Volume_7d_SMA', 'Sentiment_7d_SMA',
       'Open_14d_SMA', 'Close_14d_SMA', 'High_14d_SMA', 'Volume_14d_SMA',
       'Sentiment_14d_SMA', 'Open_10d_STD', 'Close_10d_STD', 'High_10d_STD',
       'Volume_10d_STD', 'Sentiment_10d_STD'],
      dtype='object')

In [114]:
# Drop NA values
merged_data = merged_data.dropna(subset=['sentiment_score', 'Movement'])

# Prepare data: [Affected_Series (Movement), Causing_Series (sentiment_score)]
test_data = merged_data[['Movement', 'sentiment_score']]

# Perform Granger causality test for lags from 1 to 3
max_lag = 3
granger_results = grangercausalitytests(test_data, maxlag=max_lag, verbose=True)



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.0137  , p=0.9067  , df_denom=2781, df_num=1
ssr based chi2 test:   chi2=0.0138  , p=0.9066  , df=1
likelihood ratio test: chi2=0.0138  , p=0.9066  , df=1
parameter F test:         F=0.0137  , p=0.9067  , df_denom=2781, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.1332  , p=0.8753  , df_denom=2778, df_num=2
ssr based chi2 test:   chi2=0.2669  , p=0.8751  , df=2
likelihood ratio test: chi2=0.2669  , p=0.8751  , df=2
parameter F test:         F=0.1332  , p=0.8753  , df_denom=2778, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.3292  , p=0.8043  , df_denom=2775, df_num=3
ssr based chi2 test:   chi2=0.9900  , p=0.8037  , df=3
likelihood ratio test: chi2=0.9898  , p=0.8037  , df=3
parameter F test:         F=0.3292  , p=0.8043  , df_denom=2775, df_num=3




No Causality between sentiment score and Movement.

## Causality Testing for Volatility

In [115]:
df = merged_data.copy()

In [130]:
# Assume df has columns: ['Close', 'Aggregate_Sentiment']
# Sort by date if not already
df = df.sort_index()

# Calculate daily returns (in percentage)
df['Daily_Return'] = df['Close'].pct_change() * 100

# Choose a rolling window (e.g., 10 days) for volatility
window_size = 5
df['Volatility'] = df['Daily_Return'].rolling(window=window_size).std()

# Drop the initial NaNs from rolling calculation
df = df.dropna(subset=['Volatility'])


In [131]:
df = df.dropna(subset=['sentiment_score', 'Volatility'])

In [132]:
from statsmodels.tsa.stattools import grangercausalitytests

max_lag = 3
significance_level = 0.05


In [133]:
test_data_1 = df[['Volatility', 'sentiment_score']]
results_1 = grangercausalitytests(test_data_1, maxlag=max_lag, verbose=False)

# Check if any lag shows significance
sent_cause_vol = False
for lag in range(1, max_lag+1):
    p_value = results_1[lag][0]['ssr_ftest'][1]
    if p_value < significance_level:
        sent_cause_vol = True
        print(f"At lag {lag}, p-value={p_value:.7f}, sentiment_score Granger causes Volatility.")
        break

if not sent_cause_vol:
    print("No evidence that Aggregate_Sentiment Granger causes Volatility at tested lags.")


At lag 1, p-value=0.0000455, sentiment_score Granger causes Volatility.




In [134]:
test_data_2 = df[['sentiment_score', 'Volatility']]
results_2 = grangercausalitytests(test_data_2, maxlag=max_lag, verbose=False)

# Check if any lag shows significance
vol_cause_sent = False
for lag in range(1, max_lag+1):
    p_value = results_2[lag][0]['ssr_ftest'][1]
    if p_value < significance_level:
        vol_cause_sent = True
        print(f"At lag {lag}, p-value={p_value:.7f}, Volatility Granger causes sentiment_score.")
        break

if not vol_cause_sent:
    print("No evidence that Volatility Granger causes Aggregate_Sentiment at tested lags.")


At lag 1, p-value=0.0000000, Volatility Granger causes sentiment_score.




In [135]:
test_data_1 = df[['Volatility', 'Movement']]
results_1 = grangercausalitytests(test_data_1, maxlag=max_lag, verbose=False)

# Check if any lag shows significance
sent_cause_vol = False
for lag in range(1, max_lag+1):
    p_value = results_1[lag][0]['ssr_ftest'][1]
    if p_value < significance_level:
        sent_cause_vol = True
        print(f"At lag {lag}, p-value={p_value:.7f}, Movement Granger causes Volatility.")
        break

if not sent_cause_vol:
    print("No evidence that Aggregate_Sentiment Granger causes Volatility at tested lags.")


At lag 1, p-value=0.0410548, Movement Granger causes Volatility.




In [136]:
test_data_1 = df[['Movement','Volatility']]
results_1 = grangercausalitytests(test_data_1, maxlag=max_lag, verbose=False)

# Check if any lag shows significance
sent_cause_vol = False
for lag in range(1, max_lag+1):
    p_value = results_1[lag][0]['ssr_ftest'][1]
    if p_value < significance_level:
        sent_cause_vol = True
        print(f"At lag {lag}, p-value={p_value:.7f}, Volatility Granger causes Movement.")
        break

if not sent_cause_vol:
    print("No evidence that Movement Granger causes Volatility at tested lags.")


At lag 1, p-value=0.0003909, Volatility Granger causes Movement.




Sentiment Score Granger Causes Volatility which Granger Causes Movement. since, there is no direct causality between sentiment score and Movement, we can use Volatility as an added feature.