In [15]:
import os 
import json
from collections import defaultdict
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt

In [16]:
from causalimpact import CausalImpact

In [14]:
# Merge JSON files
def merge_json_files(folder_path, output_file):
    """Merge every ``*.json`` file found in ``folder_path`` into one JSON list.

    Files are read in sorted filename order so the merged output is
    reproducible (``os.listdir`` returns entries in arbitrary order). A file
    whose top-level value is a list contributes its items; any other
    top-level value (for example a single object) is appended as one item
    instead of being iterated over. If ``output_file`` itself lives inside
    ``folder_path`` it is skipped, so re-running the merge does not fold a
    previous result back into the new one.

    Args:
        folder_path: Directory scanned (non-recursively) for ``.json`` files.
        output_file: Path of the merged file, written as UTF-8 JSON with an
            indent of 4 and non-ASCII characters kept as-is.

    Raises:
        OSError: If the folder or one of the files cannot be opened.
        json.JSONDecodeError: If one of the input files is not valid JSON.
    """
    output_path = os.path.abspath(output_file)
    json_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))

    merged_data = []

    for file in json_files:
        file_path = os.path.join(folder_path, file)
        # Never merge a previous output of this function into itself.
        if os.path.abspath(file_path) == output_path:
            continue
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        if isinstance(data, list):
            merged_data.extend(data)  # flatten the records of every file into one list
        else:
            # extend() on a dict would add only its keys; keep the object whole.
            merged_data.append(data)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(merged_data, f, ensure_ascii=False, indent=4)


In [None]:
# Combine the JSON files under 'sentiment_scores_json' into a single file.
merge_json_files(
    folder_path='sentiment_scores_json',
    output_file='all_sentiment_score.json',
)

In [None]:
# Calculate the monthly average sentiment score
def calculate_monthly_average(input_file, output_file):
    """Average the sentiment scores per calendar month and save them as JSON.

    ``input_file`` must contain a JSON list of records, each with a ``date``
    in ``YYYY-MM-DD`` form and a ``sentiment_score`` that ``int()`` accepts.
    The result written to ``output_file`` maps ``YYYY-MM`` keys, in ascending
    order, to the mean score of that month rounded to two decimals.
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        records = json.load(src)

    # Group the integer scores by their "YYYY-MM" bucket.
    buckets = defaultdict(list)
    for record in records:
        month_key = datetime.strptime(record['date'], '%Y-%m-%d').strftime('%Y-%m')
        buckets[month_key].append(int(record['sentiment_score']))

    # Walk the buckets in chronological (lexicographic key) order.
    averages = {}
    for month_key in sorted(buckets):
        scores = buckets[month_key]
        averages[month_key] = round(sum(scores) / len(scores), 2)

    with open(output_file, 'w', encoding='utf-8') as dst:
        json.dump(averages, dst, ensure_ascii=False, indent=4)

In [None]:
# Reduce the merged scores to one average per month.
calculate_monthly_average(
    input_file='all_sentiment_score.json',
    output_file='monthly_average_sentiment.json',
)

In [13]:
# Load the monthly averages (the file written by calculate_monthly_average:
# a mapping of "YYYY-MM" -> average score).
with open('monthly_average_sentiment.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert the mapping into a DataFrame indexed by date
df = pd.DataFrame(list(data.items()), columns=['date', 'sentiment_score'])
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

# Show the date range covered by the data
print(df.index.min(), df.index.max())

# Define the pre- and post-intervention windows (boundary at 1989-06-01)
pre_period = ['1979-01-01', '1989-05-31']
post_period = ['1989-06-01', '1994-12-31']

# Prepare the single-column float frame handed to CausalImpact.
# Note: `data` is rebound here from the loaded dict to a DataFrame.
data = df['sentiment_score'].astype(float).to_frame()

# Debug output
# NOTE(review): the printed index has gaps (e.g. 1968-03 is followed by
# 1968-09, and the pre-period slice holds 103 rows), so the series is not
# regularly spaced — confirm the model tolerates missing months.
print(data.head())
print(data.describe())
print(f"Pre-period: {pre_period}")
print(f"Post-period: {post_period}")
print(data.loc[pre_period[0]:pre_period[1]])
print(data.loc[post_period[0]:post_period[1]])

# Create the CausalImpact object and run the analysis
# NOTE(review): depending on which `causalimpact` distribution is installed,
# the model may only be fitted after an explicit `ci.run()` — confirm.
ci = CausalImpact(data, pre_period, post_period)

# Only report results when ci.inferences is not None
if ci.inferences is not None:
    print(ci.summary())
    print(ci.summary(output='report'))
    ci.plot()

    # Compare the actual monthly sentiment score with the predicted one
    # NOTE(review): verify that 'response' and 'predicted' are column names
    # of ci.inferences in the installed package version.
    actual_vs_predicted = pd.DataFrame({
        'actual': ci.inferences['response'],
        'predicted': ci.inferences['predicted']
    })

    # Visualize actual vs. predicted
    plt.figure(figsize=(10, 6))
    plt.plot(actual_vs_predicted.index, actual_vs_predicted['actual'], label='Actual')
    plt.plot(actual_vs_predicted.index, actual_vs_predicted['predicted'], label='Predicted', linestyle='--')
    plt.xlabel('Date')
    plt.ylabel('Sentiment Score')
    plt.title('Actual vs Predicted Sentiment Score')
    plt.legend()
    plt.show()
else:
    # No inferences available: report it and fall back to an empty frame
    print("CausalImpact analysis failed. ci.inferences is None.")
    actual_vs_predicted = pd.DataFrame({
        'actual': [],
        'predicted': []
    })
    print(actual_vs_predicted)

1968-03-01 00:00:00 2024-07-01 00:00:00
            sentiment_score
date                       
1968-03-01             4.00
1968-09-01             1.00
1968-10-01             4.00
1969-01-01             2.75
1969-02-01             3.00
       sentiment_score
count       636.000000
mean          3.110692
std           0.696477
min           1.000000
25%           2.720000
50%           3.050000
75%           3.452500
max           5.000000
Pre-period: ['1979-01-01', '1989-05-31']
Post-period: ['1989-06-01', '1994-12-31']
            sentiment_score
date                       
1979-01-01             3.54
1979-02-01             3.33
1979-03-01             4.00
1979-04-01             3.54
1979-05-01             3.80
...                     ...
1989-01-01             2.58
1989-02-01             3.95
1989-03-01             3.50
1989-04-01             1.50
1989-05-01             3.44

[103 rows x 1 columns]
            sentiment_score
date                       
1989-06-01             3.01
19

In [18]:
# Load the monthly averages (the file written by calculate_monthly_average:
# a mapping of "YYYY-MM" -> average score).
# NOTE(review): this cell repeats the preceding analysis cell with extra
# debug output; consider keeping only one of the two.
with open('monthly_average_sentiment.json', 'r', encoding='utf-8') as f:
    monthly_averages = json.load(f)

# Convert the mapping into a DataFrame indexed by date. A new frame is bound
# instead of mutating in place so that re-running the cell is idempotent.
df = pd.DataFrame(list(monthly_averages.items()), columns=['date', 'sentiment_score'])
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')

# Debug output
print(df.head())  # first few rows
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()

# Show the date range covered by the data
print(df.index.min(), df.index.max())

# Define the pre- and post-intervention windows (boundary at 1989-06-01)
pre_period = ['1979-01-01', '1989-05-31']
post_period = ['1989-06-01', '1994-12-31']

# Prepare the single-column float frame handed to CausalImpact
data = df['sentiment_score'].astype(float).to_frame()

# Debug output
# NOTE(review): the printed index has gaps (e.g. 1968-03 is followed by
# 1968-09), so the series is not regularly spaced — confirm the model
# tolerates missing months.
print(data.head())
print(data.describe())
print(f"Pre-period: {pre_period}")
print(f"Post-period: {post_period}")
print(data.loc[pre_period[0]:pre_period[1]])
print(data.loc[post_period[0]:post_period[1]])

# Create the CausalImpact object and run the analysis
# NOTE(review): depending on which `causalimpact` distribution is installed,
# the model may only be fitted after an explicit `ci.run()` — confirm.
ci = CausalImpact(data, pre_period, post_period)

# Only report results when ci.inferences is not None
if ci.inferences is not None:
    print(ci.summary())
    print(ci.summary(output='report'))
    ci.plot()

    # Compare the actual monthly sentiment score with the predicted one
    # NOTE(review): verify that 'response' and 'predicted' are column names
    # of ci.inferences in the installed package version.
    actual_vs_predicted = pd.DataFrame({
        'actual': ci.inferences['response'],
        'predicted': ci.inferences['predicted']
    })

    # Visualize actual vs. predicted
    plt.figure(figsize=(10, 6))
    plt.plot(actual_vs_predicted.index, actual_vs_predicted['actual'], label='Actual')
    plt.plot(actual_vs_predicted.index, actual_vs_predicted['predicted'], label='Predicted', linestyle='--')
    plt.xlabel('Date')
    plt.ylabel('Sentiment Score')
    plt.title('Actual vs Predicted Sentiment Score')
    plt.legend()
    plt.show()
else:
    # No inferences available: report it and fall back to an empty frame
    print("CausalImpact analysis failed. ci.inferences is None.")
    actual_vs_predicted = pd.DataFrame({
        'actual': [],
        'predicted': []
    })
    print(actual_vs_predicted)

            sentiment_score
date                       
1968-03-01             4.00
1968-09-01             1.00
1968-10-01             4.00
1969-01-01             2.75
1969-02-01             3.00
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 636 entries, 1968-03-01 to 2024-07-01
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sentiment_score  636 non-null    float64
dtypes: float64(1)
memory usage: 9.9 KB
None
1968-03-01 00:00:00 2024-07-01 00:00:00
            sentiment_score
date                       
1968-03-01             4.00
1968-09-01             1.00
1968-10-01             4.00
1969-01-01             2.75
1969-02-01             3.00
       sentiment_score
count       636.000000
mean          3.110692
std           0.696477
min           1.000000
25%           2.720000
50%           3.050000
75%           3.452500
max           5.000000
Pre-period: ['1979-01-01', '1989-05-31']
Post-period: [