### Reading post-sentiment data from an S3 bucket

In [2]:
import boto3
import io
import pandas as pd

# Where the classified-posts CSV lives in S3.
bucket_name = 'pranathi-sentiment'
object_key = 'final_classification.csv'

# Low-level client (used for the download) plus the resource-level bucket handle.
s3 = boto3.client('s3')
s3_resource = boto3.resource('s3')
bucket = s3_resource.Bucket(bucket_name)

# Download the object and decode its bytes into one in-memory text string.
csv_obj = s3.get_object(Key=object_key, Bucket=bucket_name)
body = csv_obj['Body']
csv_string = body.read().decode('utf-8')

# Parse the CSV text into a DataFrame and print it.
csv_buffer = io.StringIO(csv_string)
df = pd.read_csv(csv_buffer)
print(df)

          class         date
0       neutral  2021-05-24 
1       neutral  2021-05-24 
2      positive  2021-05-24 
3      negative  2021-05-24 
4       neutral  2021-05-24 
...         ...          ...
10523   neutral  2022-05-28 
10524   neutral  2022-05-28 
10525   neutral  2022-05-28 
10526   neutral  2022-05-28 
10527   neutral  2022-05-28 

[10528 rows x 2 columns]


### Transforming categorical data (sentiment)

In [3]:
def transform(x):
    """Encode a sentiment label as a numeric score.

    Parameters
    ----------
    x : str or number
        A sentiment label: 'neutral', 'positive' or 'negative'. Letter case
        and surrounding whitespace are ignored. A value that is already one
        of the scores -1, 0 or 1 is returned unchanged, so applying the
        function to an already-encoded column (e.g. re-running the cell)
        does not corrupt it.

    Returns
    -------
    int or float
        0 for neutral, 1 for positive, -1 for negative. Anything else
        (missing values, unrecognised labels) yields NaN rather than being
        silently counted as negative.
    """
    # Already-encoded score: pass through so the encoding is idempotent.
    # bool is excluded because True == 1 and False == 0 would match.
    if not isinstance(x, (str, bool)) and x in (-1, 0, 1):
        return x

    if isinstance(x, str):
        label = x.strip().lower()
        if label == 'neutral':
            return 0
        if label == 'positive':
            return 1
        if label == 'negative':
            return -1

    # Unknown or missing label.
    return float('nan')
    

In [4]:
# Replace each post's sentiment label with its numeric score.
# Note: this overwrites df['class'] in place.
df['class'] = df['class'].apply(transform)

# Average the scores per date; reset_index turns 'date' back into a regular column.
final_df = df.groupby('date').mean().reset_index()

### Reading historical S&P index data from an S3 bucket (covering the same period as the scraped posts)

In [5]:
import boto3
import io

# Where the historical index CSV lives in S3.
bucket_name = 'pranathi-sentiment'
object_key = 'Historical_S&p.csv'

# Low-level client (used for the download) plus the resource-level bucket handle.
s3 = boto3.client('s3')
s3_resource = boto3.resource('s3')
bucket = s3_resource.Bucket(bucket_name)

# Download the object and decode its bytes into one in-memory text string.
csv_obj = s3.get_object(Key=object_key, Bucket=bucket_name)
body = csv_obj['Body']
csv_string = body.read().decode('utf-8')

df_1 = pd.read_csv(io.StringIO(csv_string))

# Open-to-close move, expressed as a fraction of the opening value.
df_1['change'] = df_1['Close/Last'].sub(df_1['Open']).div(df_1['Open'])

In [6]:
import time
# Build the join key as a plain 'YYYY-MM-DD' string on BOTH sides.
# The sentiment CSV carries a trailing space after each date; rather than
# reproducing that quirk in the strftime format, whitespace is stripped from
# the sentiment dates so the merge does not depend on it.
if 'Date' in df_1.columns:
    # Guarded so that re-running this cell does not raise KeyError on 'Date'.
    df_1.rename(columns={'Date': 'date'}, inplace=True)
df_1['date'] = pd.to_datetime(df_1['date']).dt.strftime('%Y-%m-%d')

# Inner join keeps only dates present in both frames. final_df is passed as
# a stripped-key copy, so the original frame is left untouched.
final_corr_df = (
    pd.merge(
        final_df.assign(date=final_df['date'].astype(str).str.strip()),
        df_1,
        how='inner',
        on='date',
    )
    # The downstream time-series tests need rows in chronological order;
    # ISO-formatted date strings sort chronologically.
    .sort_values('date', kind='stable')
    .reset_index(drop=True)
)

### Performing Granger causality tests between S&P index performance and the average daily post sentiment

In [8]:
from statsmodels.tsa.stattools import grangercausalitytests

In [27]:
# Granger test at lag 3 only, on the pair (daily sentiment 'class', index 'change').
# NOTE(review): statsmodels states the null as "the SECOND column does not
# Granger-cause the FIRST", so this asks whether 'change' helps predict 'class' - confirm.
grangercausalitytests(
    x=final_corr_df[['class', 'change']],
    maxlag=[3],
)


Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.6736  , p=0.5690  , df_denom=237, df_num=3
ssr based chi2 test:   chi2=2.0806  , p=0.5558  , df=3
likelihood ratio test: chi2=2.0718  , p=0.5576  , df=3
parameter F test:         F=0.6736  , p=0.5690  , df_denom=237, df_num=3


{3: ({'ssr_ftest': (0.6736436634219433, 0.5689688173118932, 237.0, 3),
   'ssr_chi2test': (2.080620935126002, 0.5558432182085462, 3),
   'lrtest': (2.0718001757477396, 0.5576388127470482, 3),
   'params_ftest': (0.6736436634219584, 0.5689688173118875, 237.0, 3.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fede25f44c0>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fede25f4f10>,
   array([[0., 0., 0., 1., 0., 0., 0.],
          [0., 0., 0., 0., 1., 0., 0.],
          [0., 0., 0., 0., 0., 1., 0.]])])}

In [25]:
# Granger test at lag 3 only, with the column order reversed: ('change', 'class').
# NOTE(review): statsmodels states the null as "the SECOND column does not
# Granger-cause the FIRST", so this asks whether 'class' helps predict 'change' - confirm.
grangercausalitytests(
    x=final_corr_df[['change', 'class']],
    maxlag=[3],
)


Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.3474  , p=0.7911  , df_denom=237, df_num=3
ssr based chi2 test:   chi2=1.0730  , p=0.7836  , df=3
likelihood ratio test: chi2=1.0706  , p=0.7842  , df=3
parameter F test:         F=0.3474  , p=0.7911  , df_denom=237, df_num=3


{3: ({'ssr_ftest': (0.34740529763178796, 0.7910615544027828, 237.0, 3),
   'ssr_chi2test': (1.0729986407867882, 0.7835959894605385, 3),
   'lrtest': (1.0706462599564475, 0.7841644964270014, 3),
   'params_ftest': (0.34740529763177436, 0.7910615544027957, 237.0, 3.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fede25f4c10>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fede25f47f0>,
   array([[0., 0., 0., 1., 0., 0., 0.],
          [0., 0., 0., 0., 1., 0., 0.],
          [0., 0., 0., 0., 0., 1., 0.]])])}

In [22]:
# Granger test at lag 1 only, on the pair (daily sentiment 'class', index level 'Close/Last').
# NOTE(review): 'Close/Last' is a raw price level; Granger tests presume
# stationary series, so check stationarity (or difference the series) before
# relying on this p-value - confirm.
grangercausalitytests(
    x=final_corr_df[['class', 'Close/Last']],
    maxlag=[1],
)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=4.4929  , p=0.0351  , df_denom=243, df_num=1
ssr based chi2 test:   chi2=4.5483  , p=0.0330  , df=1
likelihood ratio test: chi2=4.5068  , p=0.0338  , df=1
parameter F test:         F=4.4929  , p=0.0351  , df_denom=243, df_num=1


{1: ({'ssr_ftest': (4.492854266511155, 0.035051018224726505, 243.0, 1),
   'ssr_chi2test': (4.54832160313475, 0.03295103145340592, 1),
   'lrtest': (4.506785583831743, 0.03376063007888898, 1),
   'params_ftest': (4.492854266511122, 0.03505101822472738, 243.0, 1.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fede25ea280>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fede25ea250>,
   array([[0., 1., 0.]])])}

In [28]:
# Granger test at lag 3 only, on the pair (daily sentiment 'class', index level 'Close/Last').
# NOTE(review): 'Close/Last' is a raw price level; Granger tests presume
# stationary series, so check stationarity (or difference the series) before
# relying on this p-value - confirm.
grangercausalitytests(
    x=final_corr_df[['class', 'Close/Last']],
    maxlag=[3],
)


Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.8438  , p=0.1399  , df_denom=237, df_num=3
ssr based chi2 test:   chi2=5.6949  , p=0.1274  , df=3
likelihood ratio test: chi2=5.6295  , p=0.1311  , df=3
parameter F test:         F=1.8438  , p=0.1399  , df_denom=237, df_num=3


{3: ({'ssr_ftest': (1.8438389714675345, 0.13987319469917028, 237.0, 3),
   'ssr_chi2test': (5.694895051114917, 0.1274354571538159, 3),
   'lrtest': (5.629452698972045, 0.13109769932162563, 3),
   'params_ftest': (1.8438389714674843, 0.13987319469918022, 237.0, 3.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fede25f4370>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fede25f4250>,
   array([[0., 0., 0., 1., 0., 0., 0.],
          [0., 0., 0., 0., 1., 0., 0.],
          [0., 0., 0., 0., 0., 1., 0.]])])}

In [23]:
# Granger test at lag 1 only, with the column order reversed: ('Close/Last', 'class').
# NOTE(review): 'Close/Last' is a raw price level; Granger tests presume
# stationary series, so check stationarity (or difference the series) before
# relying on this p-value - confirm.
grangercausalitytests(
    x=final_corr_df[['Close/Last', 'class']],
    maxlag=[1],
)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.7411  , p=0.3902  , df_denom=243, df_num=1
ssr based chi2 test:   chi2=0.7503  , p=0.3864  , df=1
likelihood ratio test: chi2=0.7491  , p=0.3868  , df=1
parameter F test:         F=0.7411  , p=0.3902  , df_denom=243, df_num=1


{1: ({'ssr_ftest': (0.741109895932386, 0.39015418750858133, 243.0, 1),
   'ssr_chi2test': (0.7502594008204401, 0.3863941153663569, 1),
   'lrtest': (0.7491176380140132, 0.38675573565356447, 1),
   'params_ftest': (0.7411098959319826, 0.39015418750871145, 243.0, 1.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fede25eac40>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fede25f4c40>,
   array([[0., 1., 0.]])])}