## 算法在不同数据规模下的运行时间

In [None]:
import jieba
import time
import matplotlib.pyplot as plt

# Seed keywords, each written with a trailing "::" marker.
dirty_seedwords = [
    "图片::",
    "手机::",
    "小说::",
    "视频::",
    "下载::",
    "大全::",
    "qq::",
    "电影::",
    "中国::",
    "世界::",
]

# Register the custom dictionary with jieba.
jieba.load_userdict('../data/dictionary')

# Drop the last two characters (the "::" marker) to get the bare seed words.
seedwords = [marked_keyword[:-2] for marked_keyword in dirty_seedwords]

# Path of the input file holding the search entries for the seed keywords.
seed_words_query_file_path = '../data/processed/seed_words_query.train'

# Load the stop words: each line of the file, stripped of surrounding
# whitespace, becomes one member of the set.
stopwords_file = '../data/stop_words/merge_stopwords.txt'
stopwords = set()
with open(stopwords_file, 'r', encoding='utf-8') as file:
    stopwords.update(raw_line.strip() for raw_line in file)

# Per-seed-word measurements, kept in lockstep: the k-th element of each list
# belongs to the k-th seed word.
data_sizes = []  # number of search entries counted for each seed word
run_times = []   # elapsed seconds spent reading the file and counting


def _count_seed_entries(lines, seedword):
    """Count the search entries in *lines* that belong to *seedword*.

    Every line that (after stripping) starts with ``seedword + "::"`` counts
    as one entry, and so does each line following it, up to but not including
    the first line that is empty once stripped (or the end of *lines*).
    """
    prefix = seedword + "::"  # built once instead of once per line
    line_count = len(lines)
    total = 0
    for i, line in enumerate(lines):
        if not line.strip().startswith(prefix):
            continue
        total += 1  # the keyword line itself
        # Walk forward by index rather than slicing lines[i + 1:], which would
        # copy the whole remainder of the list for every keyword line.
        j = i + 1
        while j < line_count and lines[j].strip() != "":
            total += 1  # one related item
            j += 1
    return total


# Count the search entries of every seed word and time each count.
for seedword in seedwords:
    # perf_counter() is the clock meant for measuring short durations; unlike
    # time.time() it is not affected by system clock adjustments.
    start_time = time.perf_counter()

    with open(seed_words_query_file_path, 'r', encoding='utf-8') as input_data:
        lines = input_data.readlines()  # the whole file, one element per line
    s_query_volume = _count_seed_entries(lines, seedword)

    end_time = time.perf_counter()

    print(f"Seed word: {seedword}, Search Entries: {s_query_volume}")

    # Record the data size and the elapsed time for this seed word.
    data_sizes.append(s_query_volume)
    run_times.append(end_time - start_time)

# Line chart of running time against data size.
# The measurements were collected in seed-word order, so order the points by
# data size first; otherwise the line doubles back on itself whenever a later
# seed word has fewer entries than an earlier one. New lists are built so that
# data_sizes and run_times keep their seed-word order.
points_by_size = sorted(zip(data_sizes, run_times))
sorted_sizes = [size for size, _ in points_by_size]
sorted_times = [elapsed for _, elapsed in points_by_size]

plt.figure(figsize=(10, 6))
plt.plot(sorted_sizes, sorted_times, marker='o', linestyle='-', color='b')
plt.title('Algorithm Running Time vs Data Size')
plt.xlabel('Number of Search Entries')
plt.ylabel('Running Time (seconds)')
plt.xticks(sorted_sizes)  # one x tick per measured data size
plt.grid()
plt.show()

# Print the elapsed time recorded for each seed word, to four decimal places.
timing_pairs = zip(seedwords, run_times)
for seedword, run_time in timing_pairs:
    report_line = f"Seed word: {seedword}, Running time: {run_time:.4f} seconds"
    print(report_line)
