In [None]:
# 2读取数据
import pandas as pd
import re
import jieba
import jieba.analyse
from collections import Counter
# Load the raw scraped CSV (decoded as UTF-8) into the working DataFrame.
df = pd.read_csv(r'd:\桌面\Sina Visitor System25.csv', encoding='utf-8')
# 2. Normalise the timestamp format (keep only year, month and day)
def clean_date(t):
    """Return the first 'YYYY年M月D日' date found in *t* as 'YYYY-MM-DD'.

    The input is stringified and stripped before searching, so any value is
    accepted; text surrounding the date (e.g. a clock time) is ignored.
    Returns '' when no such date is present.
    """
    found = re.search(r'(\d{4})年(\d{1,2})月(\d{1,2})日', str(t).strip())
    if found is None:
        return ''
    yyyy, mm, dd = found.groups()
    # Zero-pad month and day so the result is a fixed-width ISO-style date.
    return f"{yyyy}-{int(mm):02d}-{int(dd):02d}"

# Overwrite the raw time column with normalised 'YYYY-MM-DD' strings
# ('' for rows where no date could be found).
df['时间'] = df['时间'].apply(clean_date)

# 3. Clean the like-count column
def clean_like(x):
    """Parse a like-count cell into an int.

    The bare label '赞' (shown when a post has no likes) maps to 0.
    Integer text is parsed directly; float-formatted values such as 12.0 or
    '12.0' (what pandas produces for a numeric column containing NaN) are
    truncated to an int. Anything that cannot be parsed (NaN, empty,
    arbitrary text) yields 0.
    """
    text = str(x).strip()
    if text == '赞':
        return 0
    try:
        # Parse as int first so large integer strings keep full precision.
        return int(text)
    except ValueError:
        pass
    try:
        return int(float(text))
    except (ValueError, OverflowError):
        # ValueError: not numeric, or NaN; OverflowError: infinity.
        return 0

# Replace the raw like column with the parsed integer counts.
df['喜欢'] = df['喜欢'].apply(clean_like)

# 4. Clean the post body
def clean_text(text):
    """Strip scraping artefacts, hashtags and unwanted symbols from a post.

    Steps, in order:
      1. Remove the UI residue '收起d', 'O网页链接' and 'L…微博视频'.
      2. Remove '#…#' hashtags.
      3. Remove brackets and every character that is not Chinese, ASCII
         alphanumeric, one of '#@，。！？、' or whitespace.
      4. Collapse whitespace runs to a single space and trim.

    Missing values (None / NaN) return '' instead of the literal text 'nan'.
    """
    if text is None or pd.isna(text):
        return ''
    text = str(text)
    # The video-link residue is matched as one whitespace-free run starting at
    # 'L'. An unbounded match would start at the first 'L' anywhere in the post
    # (e.g. the 'L' of "Labubu") and delete everything up to '微博视频'.
    text = re.sub(r'收起d|O网页链接|L\S*?微博视频', '', text)
    text = re.sub(r'#.*?#', '', text)
    text = re.sub(r'[【】（）\[\]<>]|[^\u4e00-\u9fa5a-zA-Z0-9#@，。！？、\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Replace the raw post body with its cleaned version.
df['正文'] = df['正文'].apply(clean_text)

# 3 Rename the columns to English, then anonymise the users.
# The rename must come first: the original name column ('名称') becomes
# 'Username' and is then overwritten with pseudonyms. Creating 'Username'
# before the rename would leave two 'Username' columns, one of which still
# holds the real account names.
df = df.rename(columns={"名称": "Username", "时间": "Timestamp", "正文": "Content", "喜欢": "Likes"})

# One sequential pseudonym per row: User_001, User_002, ...
df['Username'] = [f'User_{row_no:03d}' for row_no in range(1, len(df) + 1)]

print(df.columns)  # check that the column names are now English


# 4 Post-processing
def refine_content(content):
    """Second cleaning pass over an already-cleaned post body.

    * Missing values return "".
    * Promotional hashtags ('#转发…#', '#抽奖…#', '#福利…#', '#关注…#') are
      removed together with their text.
    * Runs of '!' or '?' collapse to a single character.
    * The remaining ASCII punctuation in the strip list is deleted.
    * The result is trimmed, so punctuation-only input becomes "".
    """
    if pd.isna(content):
        return ""
    content = str(content).strip()
    # Must run before the punctuation strip below: that strip deletes every
    # '#', after which this pattern could never match.
    content = re.sub(r"#(转发|抽奖|福利|关注)[^#]*#", "", content)
    content = re.sub(r"[!]{2,}", "!", content)
    content = re.sub(r"[?]{2,}", "?", content)
    content = re.sub(r"[~`@#$%^&*+=|{}:;\"<>,./-]+", "", content)
    # Trim again: removing punctuation can leave only whitespace behind, and
    # callers drop rows by comparing against the empty string.
    return content.strip()

# Run the second cleaning pass, then drop rows whose content is the empty
# string and renumber the index from 0.
df["Content"] = df["Content"].apply(refine_content)
df = df[df["Content"] != ""].reset_index(drop=True)

# 5 Keyword extraction
# Common function words that are filtered out of the extracted keywords.
stop_words = {
    "的", "了", "是", "在", "我", "你", "他", "她", "它", "我们", "你们", "他们",
    "和", "就", "都", "也", "还", "这个", "那个", "这样", "那样", "所以", "因为",
    "什么", "怎么", "但是", "如果", "可以", "没有", "已经", "自己", "现在", "今天",
}

def extract_keywords(content, top_k=5):
    """Return up to *top_k* keywords of *content* joined by ', '.

    Candidates come from jieba's tag extraction, restricted to the part-of-
    speech tags 'n', 'v', 'a' and 'ns'. Single-character candidates and
    entries of the module-level ``stop_words`` set are discarded afterwards.
    Empty content returns "".

    NOTE(review): filtering happens after jieba has already truncated to
    *top_k*, so fewer than *top_k* keywords may come back — confirm this is
    intended.
    """
    if not content:
        return ""

    def _is_informative(word):
        return len(word) > 1 and word not in stop_words

    candidates = jieba.analyse.extract_tags(
        content, topK=top_k, withWeight=False, allowPOS=("n", "v", "a", "ns")
    )
    selected = list(filter(_is_informative, candidates))
    return ", ".join(selected[:top_k])

# Store each post's keywords as one ', '-joined string and preview the
# first rows.
df["Keywords"] = df["Content"].apply(extract_keywords)
print("\n样例关键词：")
print(df[["Content", "Keywords"]].head())

# 6 Keyword frequency analysis
# Each row stores its keywords joined by ", ", so split every row on that same
# separator. Joining all rows with a different separator and splitting once
# would fuse the last keyword of one row with the first keyword of the next.
# Empty strings (rows without keywords) are skipped so they are not counted.
all_keywords = [
    kw
    for row_keywords in df["Keywords"].dropna()
    for kw in row_keywords.split(", ")
    if kw
]
keyword_freq = Counter(all_keywords).most_common(10)
print("\n前 10 个关键词：")
for kw, freq in keyword_freq:
    print(f"{kw}: {freq}")

# 7 Add the emotion column
# Create an empty 'Emotion' column as a placeholder for labelling, unless the
# data already has one.
if "Emotion" not in df.columns:
    df["Emotion"] = ""
    print("已添加空的 'Emotion' 列用于情绪标注。")

# Save
# The path is relative, so the file is written to the current working
# directory. 'utf-8-sig' prefixes the file with a UTF-8 byte-order mark.
output_file = "25.csv"
df.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"\n数据集已保存为 '{output_file}'。")
print("输出列：", df.columns.tolist())

In [None]:
# Assign an emotion label to every post
import os
print(os.getcwd())  # show the working directory relative paths resolve against
import pandas as pd
import re

# Load the data
# NOTE(review): the cleaning step saves '25.csv' relative to the working
# directory, while this cell reads an absolute path; they are the same file
# only if that step ran with the Downloads folder as working directory — confirm.

file_path = r'd:\下载\25.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig')


# Clean the text and split it into clauses
def split_sentences(text):
    """Split *text* into clauses longer than three characters.

    Every '哈' is removed first (the character class matches any run of that
    one character), runs of '！？!.。' are unified to a single '。', and the
    text is then cut at '。', '，' and the connectives 但 / 然后 / 于是 /
    所以 / 因为 / 而且 / 并且. Clauses are stripped; those of length <= 3
    are dropped.
    """
    cleaned = re.sub(r'[哈哈哈]+', '', str(text))
    cleaned = re.sub(r'[！？!.。]+', '。', cleaned)
    pieces = re.split(r'。|，|但|然后|于是|所以|因为|而且|并且', cleaned)
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if len(piece) > 3]

# Small sentiment lexicons
positive_words = ['开心', '喜欢', '幸福', '感动', '细腻', '记住', '惊喜', '实现', '可爱']
negative_words = ['烦', '不开心', '生气', '失望', '难过', '糟糕']

# Decide the emotion label
def get_sentiment(sentences):
    """Label a list of clauses as '正向', '负向' or '中性'.

    For every clause, each negative word it contains subtracts 1 and each
    positive word it contains adds 1 (a word counts once per clause). The sign
    of the total decides the label; a total of 0, including an empty list,
    is '中性'.

    Positive words are looked up only in the part of the clause not covered
    by a negative word. Otherwise a negative phrase that embeds a positive
    word ('不开心' contains '开心') would score +1 and -1 and come out
    neutral.
    """
    score = 0
    for clause in sentences:
        remainder = clause
        for word in negative_words:
            if word in clause:
                score -= 1
                # Blank the phrase out with a space (not '') so the characters
                # on either side cannot join into a new word.
                remainder = remainder.replace(word, ' ')
        for word in positive_words:
            if word in remainder:
                score += 1
    if score > 0:
        return '正向'
    if score < 0:
        return '负向'
    return '中性'

# Run the pipeline: split each post into clauses, then label the post from
# its clauses.
df['sentences'] = df['Content'].apply(split_sentences)
df['Emotion'] = df['sentences'].apply(get_sentiment)

# Save the result (UTF-8 with a byte-order mark, without the index column)
output_file = r'd:\下载\25终.csv'
df.to_csv(output_file, index=False, encoding='utf-8-sig')
print(f"处理完成，已保存为 {output_file}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import jieba
from collections import Counter
from scipy.stats import chi2_contingency

# --- Use a font with Chinese glyphs so the labels below render correctly ---
plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei; use another CJK font if it is not installed
plt.rcParams['axes.unicode_minus'] = False    # keep the minus sign from rendering as a box
# ------------------------------------

df = pd.read_csv('d:/桌面/新20-25已处理数据.csv', encoding='gbk')  # the file is decoded as GBK

# 1. Share of posts per emotion label, in percent
emotion_counts = df['Emotion'].value_counts(normalize=True) * 100

# Colour every bar by its label rather than by its position. value_counts()
# orders labels by frequency, so a positional colour list would give the same
# emotion a different colour here than in the figures that plot the labels in
# the fixed order 正向 / 负向 / 中性. Unknown labels fall back to grey.
emotion_colors = {'正向': '#66c2a5', '负向': '#fc8d62', '中性': '#8da0cb'}
bar_colors = [emotion_colors.get(label, '#999999') for label in emotion_counts.index]

# Figure 1: bar chart of the emotion distribution
plt.figure(figsize=(10, 7))  # slightly larger canvas so title and labels fit
emotion_counts.plot(kind='bar', color=bar_colors)
plt.title('图4-1：泡泡玛特用户评论总体情绪极性分布', fontsize=18)
plt.xlabel('情绪极性', fontsize=15)
plt.ylabel('发帖占比 (%)', fontsize=15)
plt.xticks(rotation=0, fontsize=13)
plt.yticks(fontsize=13)
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Print the percentage just above each bar.
for index, value in enumerate(emotion_counts):
    plt.text(index, value + 0.5, f'{value:.1f}%', ha='center', va='bottom', fontsize=12)
plt.tight_layout()  # avoid clipped or overlapping elements
plt.show()

# To keep a copy on disk, call savefig *before* plt.show():
# plt.savefig('D:/桌面/图4-1_泡泡玛特情绪分布.png', dpi=300, bbox_inches='tight')

In [None]:
#数据可视化代码
import pandas as pd
import matplotlib.pyplot as plt
import re
import jieba
from collections import Counter
from scipy.stats import chi2_contingency # 用于卡方检验

# --- Global Chinese font setup (pick a font available on your OS) ---
plt.rcParams['font.sans-serif'] = ['SimHei']  # font with Chinese glyphs
plt.rcParams['axes.unicode_minus'] = False    # keep the minus sign from rendering as a box
# -----------------------------------------------------------

# The file is decoded as GBK.
try:
    df = pd.read_csv('d:/桌面/新20-25已处理数据.csv', encoding='gbk')
except FileNotFoundError:
    print("错误：文件 'd:/桌面/新20-25已处理数据.csv' 未找到。请检查文件路径是否正确。")
    # Stop here: nothing below can run without the data. SystemExit is raised
    # directly because exit() is a helper injected by the `site` module for
    # interactive use, and calling it without an argument reports success
    # (status 0) even though loading failed.
    raise SystemExit(1)

# (1) Data overview and overall emotion distribution

# 1. Share of posts per emotion label, in percent
emotion_counts = df['Emotion'].value_counts(normalize=True) * 100

# Colour every bar by its label rather than by its position. value_counts()
# orders labels by frequency, so a positional colour list would give the same
# emotion a different colour here than in the later figures, which plot the
# labels in the fixed order 正向 / 负向 / 中性. Unknown labels fall back to grey.
emotion_colors = {'正向': '#66c2a5', '负向': '#fc8d62', '中性': '#8da0cb'}
bar_colors = [emotion_colors.get(label, '#999999') for label in emotion_counts.index]

# Figure 4-1: bar chart of the emotion distribution
plt.figure(figsize=(10, 7))  # slightly larger canvas
emotion_counts.plot(kind='bar', color=bar_colors)
plt.title('图4-1：泡泡玛特用户评论总体情绪极性分布', fontsize=18)
plt.xlabel('情绪极性', fontsize=15)
plt.ylabel('发帖占比 (%)', fontsize=15)
plt.xticks(rotation=0, fontsize=13)
plt.yticks(fontsize=13)
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Print the percentage just above each bar.
for index, value in enumerate(emotion_counts):
    plt.text(index, value + 0.5, f'{value:.1f}%', ha='center', va='bottom', fontsize=12)
plt.tight_layout()  # avoid clipped or overlapping elements
plt.show()


# (2) Emotion time series and matching against key brand events

# 1. Monthly emotion trend (Figure 4-2)
# Make sure the Timestamp column holds datetimes
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['Month'] = df['Timestamp'].dt.to_period('M') # bucket posts by calendar month

# Count posts per (month, emotion); combinations that never occur become 0
monthly_emotion_counts = df.groupby(['Month', 'Emotion']).size().unstack(fill_value=0)
# Guarantee a column for each of the three labels, even if one never occurs
for emo in ['正向', '负向', '中性']:
    if emo not in monthly_emotion_counts.columns:
        monthly_emotion_counts[emo] = 0
# Divide every row by its total to get each emotion's share of that month, in percent
monthly_emotion_percentages = monthly_emotion_counts.div(monthly_emotion_counts.sum(axis=1), axis=0) * 100

# Figure 4-2: line chart of the emotion shares over time
plt.figure(figsize=(18, 8)) # wide canvas for the time axis
monthly_emotion_percentages[['正向', '负向', '中性']].plot(
    kind='line', ax=plt.gca(), marker='o', linewidth=2,
    color=['#66c2a5', '#fc8d62', '#8da0cb'] # same colours as the other figures, in column order
)
plt.title('图4-2：泡泡玛特用户评论情绪月度趋势', fontsize=18)
plt.xlabel('月份', fontsize=15)
plt.ylabel('发帖占比 (%)', fontsize=15)
plt.xticks(rotation=45, fontsize=13)
plt.yticks(fontsize=13)
# Anchor the legend outside the axes, to the right of the plot area
plt.legend(title='情绪极性', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12, title_fontsize=14)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

# 3. Key brand events: stacked emotion comparison (Figures 4-4 & 4-5)

# All emotion categories, used with reindex so that absent labels appear as 0
all_emotions = ['正向', '负向', '中性']

# --- Figure 4-4: positive event (October 2024, the third-generation Labubu vinyl plush
# triggered a buying rush at offline stores in several US cities) ---
# Chosen as a recent event expected to produce a clearly positive reaction.
positive_event_date_str_1 = '2024-10-20' # updated by user
positive_event_name_1 = 'Labubu第三代搪胶毛绒引发抢购热潮'
positive_event_date_1 = pd.to_datetime(positive_event_date_str_1)
time_window_days = 60  # days compared on each side of the event (an earlier comment said 30; the code uses 60)

# Posts in the window before the event. Rows whose Timestamp equals the event
# date itself fall into neither subset ('<' here, '>' below).
df_pre_event_1 = df[
    (df['Timestamp'] >= positive_event_date_1 - pd.Timedelta(days=time_window_days)) &
    (df['Timestamp'] < positive_event_date_1)
]
# Posts in the window after the event.
df_post_event_1 = df[
    (df['Timestamp'] > positive_event_date_1) &
    (df['Timestamp'] <= positive_event_date_1 + pd.Timedelta(days=time_window_days))
]

# Emotion shares (percent) per window, in the fixed label order
pre_emotion_1 = df_pre_event_1['Emotion'].value_counts(normalize=True).reindex(all_emotions, fill_value=0) * 100
post_emotion_1 = df_post_event_1['Emotion'].value_counts(normalize=True).reindex(all_emotions, fill_value=0) * 100

# One row per window, one column per emotion (after the transpose)
plot_data_event_1 = pd.DataFrame({
    '事件前': pre_emotion_1,
    '事件后': post_emotion_1
})
plot_data_event_1 = plot_data_event_1.T

plt.figure(figsize=(10, 7))
plot_data_event_1.plot(kind='bar', stacked=True, ax=plt.gca(), color=['#66c2a5', '#fc8d62', '#8da0cb'])
plt.title(f'图4-4：{positive_event_name_1} 前后用户情绪分布对比', fontsize=18)
plt.xlabel('时间段', fontsize=15)
plt.ylabel('发帖占比 (%)', fontsize=15)
plt.xticks(rotation=0, fontsize=13)
plt.yticks(fontsize=13)
plt.legend(title='情绪极性', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12, title_fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


# --- Figure 4-5: negative event (March 2025, Labubu's popularity led to scalping and
# heavy mark-ups on the secondary market) ---
negative_event_date_str_2 = '2025-03-1' # updated by user
negative_event_name_2 = 'labubu爆火导致二级市场溢价' # the negative event
negative_event_date_2 = pd.to_datetime(negative_event_date_str_2)
time_window_days = 60  # days compared on each side of the event (an earlier comment said 30; the code uses 60)

# Posts in the window before the event. Rows whose Timestamp equals the event
# date itself fall into neither subset ('<' here, '>' below).
df_pre_event_2 = df[
    (df['Timestamp'] >= negative_event_date_2 - pd.Timedelta(days=time_window_days)) &
    (df['Timestamp'] < negative_event_date_2)
]
# Posts in the window after the event.
df_post_event_2 = df[
    (df['Timestamp'] > negative_event_date_2) &
    (df['Timestamp'] <= negative_event_date_2 + pd.Timedelta(days=time_window_days))
]

# reindex after value_counts so every emotion is present (0 when absent)
pre_emotion_2 = df_pre_event_2['Emotion'].value_counts(normalize=True).reindex(all_emotions, fill_value=0) * 100
post_emotion_2 = df_post_event_2['Emotion'].value_counts(normalize=True).reindex(all_emotions, fill_value=0) * 100

# One row per window, one column per emotion (after the transpose)
plot_data_event_2 = pd.DataFrame({
    '事件前': pre_emotion_2,
    '事件后': post_emotion_2
})
plot_data_event_2 = plot_data_event_2.T

plt.figure(figsize=(10, 7))
plot_data_event_2.plot(kind='bar', stacked=True, ax=plt.gca(), color=['#66c2a5', '#fc8d62', '#8da0cb'])
plt.title(f'图4-5：{negative_event_name_2} 前后用户情绪分布对比', fontsize=18)
plt.xlabel('时间段', fontsize=15)
plt.ylabel('发帖占比 (%)', fontsize=15)
plt.xticks(rotation=0, fontsize=13)
plt.yticks(fontsize=13)
plt.legend(title='情绪极性', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12, title_fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# --- Additional chart: monthly post volume with the key events marked (titled Figure 4-6) ---
# Number of posts per calendar month
monthly_total_comments = df.groupby('Month').size()

plt.figure(figsize=(18, 8))
monthly_total_comments.plot(kind='line', marker='o', linewidth=2, color='darkblue')

# Mark the positive event with a vertical line at its month
# NOTE(review): x is a pandas Period; this presumably relies on pandas' unit
# converter for the period axis — confirm the line lands on the right month.
plt.axvline(x=positive_event_date_1.to_period('M'), color='green', linestyle='--', label=f'{positive_event_name_1} ({positive_event_date_str_1})')
# Mark the negative event
plt.axvline(x=negative_event_date_2.to_period('M'), color='red', linestyle='--', label=f'{negative_event_name_2} ({negative_event_date_str_2})')

plt.title('图4-6：泡泡玛特用户发帖总数月度趋势及关键事件标记', fontsize=18)
plt.xlabel('月份', fontsize=15)
plt.ylabel('发帖总数', fontsize=15)
plt.xticks(rotation=45, fontsize=13)
plt.yticks(fontsize=13)
plt.legend(fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

import pandas as pd
from scipy.stats import chi2_contingency

# Assuming 'df' (your DataFrame) is already loaded and 'Timestamp' is in datetime format.
# If not, you'd need to include the df loading and Timestamp conversion lines from your earlier code.
# For example:
# df = pd.read_csv('d:/桌面/新20-25已处理数据.csv', encoding='gbk')
# df['Timestamp'] = pd.to_datetime(df['Timestamp'])



# (5) Statistical tests and preliminary inference (Table 4-10)

# Fixed row / column order of every contingency table.
_CHI2_PERIODS = ['事件前', '事件后']
_CHI2_EMOTIONS = ['正向', '负向', '中性']

# Length of the comparison window on each side of an event. It reuses
# `time_window_days` from the stacked-bar analysis so that the figures and the
# tests cover the same posts.
time_window_chi2 = pd.Timedelta(days=time_window_days)


def _label_event_periods(data, event_date, window):
    """Return a copy of *data* with a 'period' column.

    Rows within *window* before *event_date* are '事件前', rows within
    *window* after it are '事件后', everything else (including rows exactly
    at *event_date*) is '其他'.
    """
    labelled = data.copy()
    labelled['period'] = '其他'
    stamps = labelled['Timestamp']
    labelled.loc[(stamps >= event_date - window) & (stamps < event_date), 'period'] = '事件前'
    labelled.loc[(stamps > event_date) & (stamps <= event_date + window), 'period'] = '事件后'
    return labelled


def _build_contingency_table(event_rows):
    """Cross-tabulate period x Emotion in a fixed order, filling gaps with 0.

    reindex (rather than .loc with label lists) is used so that a period or
    emotion with no rows yields a row/column of zeros instead of a KeyError.
    """
    table = pd.crosstab(event_rows['period'], event_rows['Emotion'])
    return table.reindex(index=_CHI2_PERIODS, columns=_CHI2_EMOTIONS, fill_value=0)


def _report_chi2_test(table, event_name, alpha=0.05):
    """Run a chi-square test of independence on *table* and print the result.

    Returns ``(chi2, p, dof, expected)``, or ``None`` when the test cannot be
    run (no data, or fewer than two non-empty rows/columns).
    """
    print(f"\n--- 卡方检验结果 ({event_name} 事件前后情绪分布) ---")
    if table.to_numpy().sum() == 0:
        print(f"\n{event_name} 事件前后数据不足或列联表为空，无法进行卡方检验。")
        return None

    # chi2_contingency raises ValueError when an expected frequency is 0,
    # which happens for any all-zero row or column, so drop those first.
    testable = table.loc[table.sum(axis=1) > 0, table.sum(axis=0) > 0]
    if testable.shape[0] < 2 or testable.shape[1] < 2:
        print(f"错误：{event_name} 事件前后数据不足，列联表维度小于2x2，无法进行卡方检验。")
        return None

    chi2, p, dof, expected = chi2_contingency(testable)
    # Warn on zero cells or on expected counts below the usual threshold of 5.
    if (table == 0).any().any() or expected.min() < 5:
        print("\n警告：列联表中存在0值，或期望频数可能过低。卡方检验在小样本或0值较多的情况下可能不准确。")
    print(f"列联表:\n{table}")
    print(f"卡方值 (Chi-square): {chi2:.2f}")
    print(f"自由度 (Degrees of Freedom): {dof}")
    print(f"p 值 (p-value): {p:.3f}")
    if p < alpha:
        print("结论: 在该事件前后，情绪类别分布存在显著差异。")
    else:
        print("结论: 在该事件前后，情绪类别分布无显著差异。")
    return chi2, p, dof, expected


# --- Chi-square test for the first (positive) event ---
event_date_str_chi2_1 = positive_event_date_str_1
event_name_chi2_1 = positive_event_name_1
event_date_chi2_1 = pd.to_datetime(event_date_str_chi2_1)

df_for_chi2_1 = _label_event_periods(df, event_date_chi2_1, time_window_chi2)
df_event_period_1 = df_for_chi2_1[df_for_chi2_1['period'].isin(_CHI2_PERIODS)]
contingency_table_1 = _build_contingency_table(df_event_period_1)
_report_chi2_test(contingency_table_1, event_name_chi2_1)


# --- Chi-square test for the second (negative) event ---
event_date_str_chi2_2 = negative_event_date_str_2
event_name_chi2_2 = negative_event_name_2
event_date_chi2_2 = pd.to_datetime(event_date_str_chi2_2)

df_for_chi2_2 = _label_event_periods(df, event_date_chi2_2, time_window_chi2)
df_event_period_2 = df_for_chi2_2[df_for_chi2_2['period'].isin(_CHI2_PERIODS)]
contingency_table_2 = _build_contingency_table(df_event_period_2)
_report_chi2_test(contingency_table_2, event_name_chi2_2)