In [61]:
import os
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr
from scipy.stats import ttest_ind
# The quoted "__file__" is a plain string, not the module attribute, so
# abspath() resolves it against the current working directory. Stripping two
# path components therefore yields the parent of the working directory.
_notebook_dir = os.path.dirname(os.path.abspath("__file__"))
ROOT_DIR = os.path.dirname(_notebook_dir)


# Correlation

In [3]:
# events = pd.read_csv(os.path.join(ROOT_DIR,"data","Ukraine_Black_Sea_2020_2023_Jan20.csv"), parse_dates=["EVENT_DATE"], index_col="EVENT_DATE")
# events = events[["FATALITIES","EVENT_TYPE","LOCATION","NOTES"]].loc["2022-01-01":"2022-12-31"].sort_index()
# # events = events.query("FATALITIES > 50")
# # events.iloc[50:100,:]
# events = events[["EVENT_TYPE","FATALITIES"]].rename(columns={"EVENT_DATE":"Date","EVENT_TYPE":"Count","FATALITIES":"Fatalities"})  # keep only the relevant columns
# events = pd.get_dummies(events, columns=["Count"], prefix="", prefix_sep="")  # convert to dummy variables
# events = events.resample("D").sum()
# # plot all columns in events
# events.plot(subplots=True, figsize=(15,15))


In [58]:
# Compare every news source against the system-wide (all-sources) averages of
# Resonance and Novelty, and against the Resonance predicted from Novelty by a
# single linear fit over the whole data set.
sources = ["AP","ABC","CBS","CNN","DailyMail","Express","Fox","Guardian","Mirror","NYT","Reuters"]
df_ntr = pd.read_csv(os.path.join(ROOT_DIR,"results_10","All_Results.csv"), parse_dates=["Date"], index_col="Date")

# System-wide baselines. pandas .mean() skips NaN values.
r_avg_system = df_ntr["Resonance"].mean()
n_avg_system = df_ntr["Novelty"].mean()

# OLS fit of Resonance on Novelty over all rows; the fitted values are stored
# next to the observations so they can be averaged per source below.
results = smf.ols(formula="Resonance ~ Novelty", data=df_ntr).fit()
df_ntr["Resonance_Predicted"] = results.predict(df_ntr["Novelty"])
delta_r_avg_system = r_avg_system - df_ntr["Resonance_Predicted"].mean()


def _significance_stars(p_value):
    """Return the star marker for a p-value: *** <0.01, ** <0.05, * <0.1, else ''.

    A NaN p-value fails every comparison and therefore yields ''.
    """
    if p_value < 0.01:
        return "***"
    if p_value < 0.05:
        return "**"
    if p_value < 0.1:
        return "*"
    return ""


records = []
for source in sources:
    df_filter = df_ntr[df_ntr["Source"] == source]
    r_avg = df_filter["Resonance"].mean()
    n_avg = df_filter["Novelty"].mean()
    delta_r_avg = r_avg - df_filter["Resonance_Predicted"].mean()

    # Two-sample t-test of the source's rows against all rows, per variable.
    # nan_policy="omit" keeps the test consistent with the NaN-skipping means
    # above; with the default ("propagate") a single NaN makes the p-value NaN,
    # which would silently be reported as "not significant".
    # NOTE(review): the source rows are also part of the all-rows sample, so
    # the two samples are not independent - confirm this is the intended test.
    # NOTE(review): Resonance_Predicted is a linear function of Novelty, so its
    # significance marker is expected to mirror Novelty's.
    t_results = source + " -> "
    for variable in ["Resonance","Novelty","Resonance_Predicted"]:
        _, p_value = ttest_ind(df_filter[variable], df_ntr[variable], nan_policy="omit")
        t_results += f"{variable} ({_significance_stars(p_value)}) "
    print(t_results)
    # The system-wide mean is loop-invariant, so it is not printed per source.

    records.append({
        "Source": source,
        "diff_r_avg": r_avg - r_avg_system,
        "diff_n_avg": n_avg - n_avg_system,
        "diff_delta_r_avg": delta_r_avg - delta_r_avg_system,
    })

# One row per source, built in a single construction instead of concatenating
# one-row frames.
df = pd.DataFrame(records, columns=["Source","diff_r_avg","diff_n_avg","diff_delta_r_avg"])
df.round(4).head(len(sources))

AP -> Resonance () Novelty (***) Resonance_Predicted (***) 
ABC -> Resonance () Novelty (***) Resonance_Predicted (***) 
CBS -> Resonance () Novelty () Resonance_Predicted () 
CNN -> Resonance () Novelty (***) Resonance_Predicted (***) 
DailyMail -> Resonance () Novelty (***) Resonance_Predicted (***) 
Express -> Resonance () Novelty (***) Resonance_Predicted (***) 
Fox -> Resonance () Novelty (***) Resonance_Predicted (***) 
Guardian -> Resonance () Novelty (***) Resonance_Predicted (***) 
Mirror -> Resonance () Novelty () Resonance_Predicted () 
NYT -> Resonance () Novelty (**) Resonance_Predicted (**) 
Reuters -> Resonance () Novelty (***) Resonance_Predicted (***) 


Unnamed: 0,Source,diff_r_avg,diff_n_avg,diff_delta_r_avg
0,AP,-0.0037,0.6715,-0.1787
1,ABC,0.0012,0.3408,-0.0876
2,CBS,0.0036,0.0206,-0.0018
3,CNN,0.0095,0.1845,-0.0386
4,DailyMail,0.0032,0.3072,-0.0769
5,Express,0.0093,-0.1994,0.0613
6,Fox,0.0017,-0.1596,0.0433
7,Guardian,-0.014,0.0546,-0.0282
8,Mirror,0.0053,-0.0003,0.0054
9,NYT,-0.0154,-0.0388,-0.0053


In [59]:
# Add a z-score version of each per-source difference column: subtract the
# column mean, then divide by the column's standard deviation (pandas .std()).
for column in ("diff_r_avg", "diff_n_avg", "diff_delta_r_avg"):
    centered = df[column] - df[column].mean()
    df[f"z_{column}"] = centered / df[column].std()
# display the table rounded to 4 decimal places
df.round(4).head(11)


Unnamed: 0,Source,diff_r_avg,diff_n_avg,diff_delta_r_avg,z_diff_r_avg,z_diff_n_avg,z_diff_delta_r_avg
0,AP,-0.0037,0.6715,-0.1787,-0.4208,2.1122,-2.1405
1,ABC,0.0012,0.3408,-0.0876,0.1653,0.9309,-0.9038
2,CBS,0.0036,0.0206,-0.0018,0.4595,-0.2134,0.2633
3,CNN,0.0095,0.1845,-0.0386,1.1712,0.3724,-0.2368
4,DailyMail,0.0032,0.3072,-0.0769,0.4036,0.8108,-0.7579
5,Express,0.0093,-0.1994,0.0613,1.1496,-0.9994,1.12
6,Fox,0.0017,-0.1596,0.0433,0.2262,-0.8572,0.875
7,Guardian,-0.014,0.0546,-0.0282,-1.6639,-0.0919,-0.0966
8,Mirror,0.0053,-0.0003,0.0054,0.6618,-0.2882,0.3602
9,NYT,-0.0154,-0.0388,-0.0053,-1.8359,-0.4257,0.2147
