In [13]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import f
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests

In [26]:
v_d = pd.read_csv("/content/vaccinations.csv")

In [27]:
grouped_data = v_d.groupby('date')
vac_data = grouped_data.agg({'location': 'count', 'daily_vaccinations': 'sum'}).reset_index()
vac_data = vac_data.rename(columns={'location': 'counties_count'})

In [28]:
t_d = pd.read_csv("/content/vaccination_all_tweets.csv")

In [29]:
t_d['date'] = pd.to_datetime(t_d['date'], format='%Y-%m-%d %H:%M:%S')

In [30]:
t_d = (t_d.groupby(t_d['date'].dt.date))
t_d = t_d.agg({'retweets': 'sum'})
t_d = t_d.reset_index()

In [31]:
df = pd.DataFrame(vac_data)
df["tweets"] = t_d["retweets"]

In [19]:
def autoregression(data_series, order, forecast_amount):
    coefficients = np.ones(order)
    lagged_matrix = np.array([data_series[i:order + i] for i in range(len(data_series) - order - 1)])
    coefficients = np.linalg.lstsq(lagged_matrix, data_series[order + 1:len(data_series)], rcond=None)[0]

    data_list = data_series.values.tolist()

    for _ in range(forecast_amount):
        prediction = np.dot(coefficients, data_list[-order:])
        data_list.append(prediction)

    return np.array(data_list)

def combined_autoregression(X_series, Y_series, order_x, order_y, forecast_amount):
    combined_order = order_x + order_y
    combined_coefficients = np.ones(combined_order)

    X_lagged_matrix = np.array([X_series[i:order_x + i] for i in range(len(X_series) - order_x - 1)])
    Y_lagged_matrix = np.array([Y_series[i:order_y + i] for i in range(len(Y_series) - order_y - 1)])

    combined_matrix = np.column_stack((X_lagged_matrix, Y_lagged_matrix))
    combined_coefficients = np.linalg.lstsq(combined_matrix, X_series[order_x + 1:len(X_series)], rcond=None)[0]

    X_list = X_series.values.tolist()
    Y_list = Y_series.values.tolist()

    for _ in range(forecast_amount):
        combined_pred = np.dot(combined_coefficients, X_list[-order_x:] + Y_list[-order_y:])
        X_list.append(combined_pred)

    return np.array(X_list)


In [32]:
daily_vac = df["daily_vaccinations"]
daily_tweet = df["tweets"]
predict_range = 5
lag = len(daily_vac) - predict_range - 2
pred = autoregression(daily_vac[:int(len(daily_vac)) - predict_range], lag ,predict_range)


In [33]:
def granger_causality_test(y_with_lag_residuals, y_without_lag_residuals, lag_order, regression_coefficients, n):

    mse_with_lag = np.mean(y_with_lag_residuals ** 2)
    mse_without_lag = np.mean(y_without_lag_residuals ** 2)

    f_statistic = (abs(mse_without_lag - mse_with_lag) / lag_order) / (mse_with_lag / abs(n - lag_order - regression_coefficients))
    p_value = f.sf(f_statistic, lag_order, n - lag_order - regression_coefficients)

    return f_statistic, p_value


In [34]:
lag_vac = len(daily_vac) - predict_range - 2
predicted_lag_vac = combined_autoregression(daily_vac[:int(len(daily_vac)) - predict_range], daily_tweet[:int(len(daily_vac)) - predict_range], lag_vac, lag_vac, predict_range)

In [35]:
f_statistic, p_value = granger_causality_test(pred, predicted_lag_vac, lag_vac, 2, len(pred)-1)
critical = f.ppf(0.95, 2, len(pred)-1)
print(f_statistic)

6.223180866872984e-13


In [37]:
print(p_value)

0.9999999999999999


Так как p_value > 0.05, то мы не отвергаем нулевую гипотезу

In [24]:
daily_vac = df["daily_vaccinations"]

predict_range = 5
lag_tweet = len(daily_vac) - predict_range - 2

daily_tweet = df["tweets"]
predicted_lag_tweet = combined_autoregression(daily_tweet[:int(len(daily_vac)) - predict_range], daily_vac[:int(len(daily_vac)) - predict_range], lag_tweet, lag_tweet, predict_range)

In [38]:
f_statistic, p_value = granger_causality_test(pred, predicted_lag_tweet, lag_tweet, 2, len(pred)-1)
critical = f.ppf(0.95, 2, len(pred)-1)

In [39]:
print(f_statistic)

0.015209125460696696


In [40]:
print(p_value)

0.9999999999999999


Так как p_value > 0.05, то мы не отвергаем нулевую гипотезу