# CompKey

- |{s}| 种子关键词的搜索量
- |{sa}| 联合查询搜索量，种子关键词和对应的中介关键词一起出现的查询搜索
- 中介关键词的权重：$$w_a(k)=\frac{|\{sa\}|}{|\{s\}|}$$ 
- |{ka}| 竞争性关键词的词频（不含有种子关键词，但有中介关键词的搜索）
- |{a}| 所有中介关键词的搜索量
- 竞争性 Comp 测度的计算公式（针对单个中介关键词 a）：$$Comp_a(k,s)=\frac{|\{ka\}|}{|\{a\}|-|\{sa\}|}$$
- 关键词 k 与种子关键词 s 的竞争性程度：$$Comp(k,s)=\sum_{i=1}^{m}{\{w_{a_i}(k)\times Comp_{a_i}(k,s)\}}$$

In [None]:
import jieba
import time

# Wall-clock duration (in seconds) of each pipeline stage, keyed by stage name.
time_stats = {}

# Reference point for the 'total_time' entry computed at the end of the notebook.
total_start_time = time.time()

# Seed keywords in the form they are matched against header lines of the
# query file: each one carries a trailing "::" marker.
dirty_seedwords = [
    "图片::", "手机::", "小说::", "视频::", "下载::", "大全::", "qq::", "电影::", "中国::", "世界::",
    # "重生::", "百度::", "官网::", "txt::", "英语::", "电视剧::", "游戏::", "查询::", "做法::", "倾城::"
    # "诛仙::"
]

# Load the custom jieba user dictionary.
jieba.load_userdict('../data/dictionary')

# Seed keywords with the two-character "::" suffix removed.
seedwords = [keyword[:-2] for keyword in dirty_seedwords]

# Path of the input file that lists the search queries of each seed keyword.
seed_words_query_file_path = '../data/processed/seed_words_query.train'

In [None]:
# 1. Load the stop words: one entry per line, surrounding whitespace removed.
stopwords_file = '../data/stop_words/merge_stopwords.txt'  # path of the stop-word file
stopwords = set()
with open(stopwords_file, 'r', encoding='utf-8') as stopword_lines:
    stopwords.update(entry.strip() for entry in stopword_lines)

stopwords

In [None]:
# Number of search queries recorded for each seed keyword, i.e. |{s}|.
s_query_volume = {keyword: 0 for keyword in seedwords}

# Header lines have the form "<seed>::"; a set gives constant-time membership
# tests instead of scanning the whole list for every input line.
seed_header_lines = set(dirty_seedwords)

# Count the queries listed under each seed keyword and time the pass.
with open(seed_words_query_file_path, 'r', encoding='utf-8') as input_data:
    s_start_time = time.time()
    current_seed_keyword = None
    for line in input_data:
        line = line.strip()
        if line in seed_header_lines:
            # A new seed section starts here; drop the trailing "::".
            current_seed_keyword = line[:-2]
        elif current_seed_keyword is not None and line:
            # Only non-empty lines inside a seed section are queries. Blank
            # lines would inflate |{s}|, and a line before the first header
            # has no seed keyword to be counted under.
            s_query_volume[current_seed_keyword] += 1
    s_end_time = time.time()
    time_stats['s_time'] = s_end_time - s_start_time

s_query_volume

In [None]:
# Tokens collected from the queries of each seed keyword; the seed keyword
# itself and stop words are left out.
word_dict = {seed: [] for seed in seedwords}
# Per seed keyword: candidate mediator keyword -> {'freq': ..., 'weight': ...}.
mid_word_dict = {seed: {} for seed in seedwords}

# Read the per-seed query file and tokenise every query line.
with open(seed_words_query_file_path, 'r', encoding='utf-8') as train_data:
    # Seed keyword whose section is currently being read.
    current_seed_word = None
    for raw_line in train_data:
        text = raw_line.strip()
        if text in dirty_seedwords:
            # Header line: switch to the next seed keyword (drop the "::").
            current_seed_word = text[:-2]
            continue
        # jieba.lcut returns the segmentation as a list; keep the non-empty
        # tokens that are neither the current seed keyword nor a stop word.
        kept_tokens = [
            token
            for token in jieba.lcut(text)
            if token != '' and token != current_seed_word and token not in stopwords
        ]
        word_dict[current_seed_word].extend(kept_tokens)

# Compute sa: how often each token occurs in the queries of its seed keyword.
sa_start_time = time.time()
for seed, tokens in word_dict.items():
    token_stats = mid_word_dict[seed]
    for token in tokens:
        entry = token_stats.setdefault(token, {'freq': 0, 'weight': 0})
        entry['freq'] += 1
sa_end_time = time.time()
time_stats['sa_time'] = sa_end_time - sa_start_time

# Keep only the 20 most frequent tokens of each seed keyword as mediator
# keyword candidates (the sort is stable, so ties keep first-seen order).
for seed, token_stats in mid_word_dict.items():
    ranked = sorted(token_stats.items(), key=lambda item: item[1]['freq'], reverse=True)
    mid_word_dict[seed] = dict(ranked[:20])

# Compute the weight of every mediator keyword: freq / |{s}|.
wight_start_time = time.time()
for seed, token_stats in mid_word_dict.items():
    for info in token_stats.values():
        info['weight'] = info['freq'] / s_query_volume[seed]
wight_end_time = time.time()
time_stats['wight_time'] = wight_end_time - wight_start_time

mid_word_dict

## 确定竞争性关键字集合

In [None]:
# Turn each seed keyword's {mediator keyword: info} mapping into a list of
# flat records, preserving the existing order.
temp = mid_word_dict
mid_word_dict = {
    seed: [
        {'keyword': mid_word, 'freq': info['freq'], 'weight': info['weight']}
        for mid_word, info in mid_words.items()
    ]
    for seed, mid_words in temp.items()
}
mid_word_dict

In [None]:

# For every seed keyword (in seedwords order): drop mediator keywords that
# contain the seed keyword as a substring, then keep the first 10 that remain.
mid_word_dict = {
    seed: [entry for entry in mid_word_dict[seed] if seed not in entry['keyword']][:10]
    for seed in seedwords
}

mid_word_dict

In [None]:
# Select the queries that do not contain the seed keyword but do contain one
# of its mediator keywords, and store them in a per-seed file.
def comkey_words_file(seedword: str):
    """Write the tokenised candidate queries for ``seedword`` to disk.

    Scans ``../data/temp/cleaned.train`` and keeps every query line that does
    not contain ``seedword`` but contains at least one mediator keyword from
    ``mid_word_dict[seedword]``. Each kept query is segmented with jieba and
    written as one line to ``../data/temp/compkey_<seedword>``, with every
    token followed by a tab character.

    Args:
        seedword: Seed keyword whose mediator keywords drive the filter.
    """
    mid_keywords = [midkeyword['keyword'] for midkeyword in mid_word_dict[seedword]]
    filename = '../data/temp/compkey_' + seedword
    # Context managers close both files even if segmentation or a write fails.
    with open('../data/temp/cleaned.train', 'r', encoding='utf-8') as query_data, \
            open(filename, 'w', encoding='utf-8') as result_data:
        for line in query_data:
            if seedword in line:
                continue
            if not any(mid_keyword in line for mid_keyword in mid_keywords):
                continue
            # Remove the line terminator before segmenting; otherwise it is
            # written out as a token of its own and every record is followed
            # by a stray line holding just a tab.
            cut_line: list[str] = jieba.lcut(line.rstrip('\r\n'))
            # Keep the "token + tab" layout (trailing tab included) so that
            # readers splitting on '\t' see the line break as a separate field.
            result_data.write(''.join(word + '\t' for word in cut_line) + '\n')


# Build the filtered query file of every seed keyword, printing each keyword
# as progress output.
for seedword in seedwords:
    print(seedword)
    comkey_words_file(seedword)

In [None]:
from collections import Counter


def stop_words_filter(word_list):
    """Return the words of ``word_list`` that are worth counting.

    A word is kept when it is not in the module-level ``stopwords`` collection
    and is not empty or whitespace-only. Order and duplicates are preserved.
    """
    return [
        word
        for word in word_list
        if word not in stopwords and word.strip() != ''
    ]


import jieba

# Extend jieba's vocabulary with two additional user dictionaries.
jieba.load_userdict('../data/dict/流行网络小说词库.txt')
jieba.load_userdict('../data/dict/网络流行新词【官方推荐】.txt')

# Accumulators filled by get_compkey_words(), one inner list per call:
# ka_list receives the counts, comp_list the selected competitor keywords.
ka_list = []
comp_list = []


# Extract the competitor keywords of a seed keyword from its filtered query file.
def get_compkey_words(seedword: str):
    """Pick one competitor keyword per mediator keyword of ``seedword``.

    For every mediator keyword in ``mid_word_dict[seedword]`` the file
    ``../data/temp/compkey_<seedword>`` is scanned; the tab-separated tokens
    of each line containing the mediator keyword are collected, passed through
    ``stop_words_filter`` and counted. Among the three most frequent tokens,
    those different from the mediator keyword are printed, and the first of
    them becomes the competitor keyword for that mediator keyword.

    One list of counts is appended to the module-level ``ka_list`` and one
    list of competitor keywords to ``comp_list``. Both lists hold exactly one
    entry per mediator keyword, in the order of ``mid_word_dict[seedword]``,
    so they can be zipped with the mediator keywords. When no candidate
    exists for a mediator keyword, the placeholder pair ``('', 0)`` is stored
    to keep that alignment.

    Args:
        seedword: Seed keyword whose competitor keywords are extracted.
    """
    ka_list_line = []
    comp_list_line = []
    for mid_word in mid_word_dict[seedword]:
        mid_keyword = mid_word['keyword']
        # Gather the tokens of every stored query that contains the mediator keyword.
        compkey_query_list = []
        # The context manager closes the file even if counting fails.
        with open('../data/temp/compkey_' + seedword, 'r', encoding='utf-8') as compkey_data:
            for line in compkey_data:
                if mid_keyword in line:
                    compkey_query_list.extend(line.split('\t'))
        count_result = Counter(stop_words_filter(compkey_query_list))
        # Candidates: the three most frequent tokens minus the mediator keyword itself.
        candidates = [
            (key, val) for key, val in count_result.most_common(3) if key != mid_keyword
        ]
        for key, val in candidates:
            print(key, val, end='  ')
        print()
        # Only the best candidate is recorded. The placeholder keeps one entry
        # per mediator keyword; without it the later zip() with the mediator
        # keywords would pair values with the wrong keyword.
        best_word, best_count = candidates[0] if candidates else ('', 0)
        ka_list_line.append(best_count)
        comp_list_line.append(best_word)
    ka_list.append(ka_list_line)
    comp_list.append(comp_list_line)


# Select the competitor keywords of every seed keyword and time the whole pass.
ka_start_time = time.time()
for seedword in seedwords:
    print('\n' + seedword + ':\n')
    get_compkey_words(seedword)
ka_end_time = time.time()
time_stats['ka_time'] = ka_end_time - ka_start_time

In [None]:
# Display the counts recorded by get_compkey_words(), one list per seed keyword.
ka_list

In [None]:
# Display the competitor keywords recorded by get_compkey_words(), one list per seed keyword.
comp_list

In [None]:
# Map every seed keyword to its list of competitor keywords
# (seedwords and comp_list are paired by position).
compkeywords = {seed: competitors for seed, competitors in zip(seedwords, comp_list)}
compkeywords

In [None]:
# Two-dimensional list: for each seed keyword (in seedwords order), the
# 'keyword' field of each of its mediator keyword records.
midkeywords_list = [
    [entry['keyword'] for entry in mid_word_dict[seed]]
    for seed in seedwords
]
midkeywords_list

In [None]:
# Competitor keyword selected for each mediator keyword.
# Layout: seed keyword -> {mediator keyword: competitor keyword}
# Every seed keyword starts with its own empty dict; dict.fromkeys(seedwords, {})
# would make all keys share one dict object.
compwords = {seed: {} for seed in seedwords}
# seedwords, midkeywords_list and comp_list are paired by position.
for seed, mid_keywords, competitors in zip(seedwords, midkeywords_list, comp_list):
    compwords[seed] = dict(zip(mid_keywords, competitors))
compwords

In [None]:
# Store the |{ka}| values in the dictionary ka_query_volume.
# Layout: seed keyword -> {mediator keyword: |{ka}| of its competitor keyword}
# Every seed keyword starts with its own empty dict; dict.fromkeys(seedwords, {})
# would make all keys share one dict object.
ka_query_volume = {seed: {} for seed in seedwords}
# seedwords, midkeywords_list and ka_list are paired by position.
for seed, mid_keywords, ka_counts in zip(seedwords, midkeywords_list, ka_list):
    ka_query_volume[seed] = dict(zip(mid_keywords, ka_counts))
ka_query_volume

## 计算竞争性关键字的竞争度

Comp_a(k, s) = |{ka}| / (|{a}| - |{sa}|)

计算 |{a}|

In [None]:
# seed keyword -> {mediator keyword: |{a}|}. Each seed keyword gets its own
# dict; dict.fromkeys(seedwords, {}) would make all keys share one dict object.
a_query_volume = {seed: {} for seed in seedwords}


# Compute the search volume |{a}| of every mediator keyword.
def count_midkeyword(seedword):
    """Count the queries that contain each mediator keyword of ``seedword``.

    Args:
        seedword: Seed keyword; must be an element of ``seedwords``.

    Returns:
        dict mapping each mediator keyword of ``seedword`` to the number of
        lines in ``../data/temp/cleaned.train`` that contain it as a substring.

    Raises:
        ValueError: If ``seedword`` is not in ``seedwords``.
    """
    # Resolve the mediator keywords once, outside the per-line loop:
    # seedwords.index() is a linear search and its result never changes here.
    mid_keywords = midkeywords_list[seedwords.index(seedword)]
    count_dict = dict.fromkeys(mid_keywords, 0)
    # The context manager closes the file even if counting fails.
    with open('../data/temp/cleaned.train', 'r', encoding='utf-8') as query_data:
        for sentence in query_data:
            for midkeyword in mid_keywords:
                if midkeyword in sentence:
                    count_dict[midkeyword] += 1
    return count_dict


# Count |{a}| for the mediator keywords of every seed keyword and time the pass.
a_start_time = time.time()
for seedword in seedwords:
    print(seedword)
    a_query_volume[seedword] = count_midkeyword(seedword)
a_end_time = time.time()
time_stats['a_time'] = a_end_time - a_start_time

a_query_volume

In [None]:
# seed keyword -> {mediator keyword: |{sa}|}. Each seed keyword gets its own
# dict; dict.fromkeys(seedwords, {}) would make all keys share one dict object.
sa_query_volume = {seed: {} for seed in seedwords}


# Pair the 'keyword' and 'freq' fields of the mediator keyword records and
# store them in sa_query_volume.
def get_sa_query_volume():
    """Fill the module-level ``sa_query_volume`` and return it.

    For every seed keyword in ``seedwords`` the entry is replaced by a new
    dict mapping each mediator keyword (``'keyword'``) to its ``'freq'``
    value, taken from ``mid_word_dict``.
    """
    for seedword in seedwords:
        sa_query_volume[seedword] = {
            record['keyword']: record['freq'] for record in mid_word_dict[seedword]
        }
    return sa_query_volume


# Populate sa_query_volume in place (the return value is not needed here), then display it.
get_sa_query_volume()

sa_query_volume

In [None]:

# 计算Comp
import copy


def getcomp():
    """Compute Comp_a(k, s) = |{ka}| / (|{a}| - |{sa}|) per mediator keyword.

    Reads the module-level ``ka_query_volume``, ``a_query_volume`` and
    ``sa_query_volume`` (each: seed keyword -> {mediator keyword: count}) and
    stores the elapsed time in ``time_stats['comp_time']``.

    Returns:
        dict: seed keyword -> {mediator keyword: score}. The result starts as
        a deep copy of ``sa_query_volume``, so the inputs are never modified
        and a mediator keyword absent from ``a_query_volume`` keeps its
        |{sa}| value. When |{a}| - |{sa}| is not positive there are no
        queries with the mediator keyword but without the seed keyword; the
        score is then 0.0 instead of raising ZeroDivisionError or going
        negative.
    """
    comp_start_time = time.time()
    comp_query_volume = copy.deepcopy(sa_query_volume)
    for seedword, a_counts in a_query_volume.items():
        for midkeyword, a_count in a_counts.items():
            ka_count = ka_query_volume[seedword][midkeyword]
            denominator = a_count - sa_query_volume[seedword][midkeyword]
            if denominator > 0:
                comp_query_volume[seedword][midkeyword] = ka_count / denominator
            else:
                comp_query_volume[seedword][midkeyword] = 0.0
    comp_end_time = time.time()
    time_stats['comp_time'] = comp_end_time - comp_start_time
    return comp_query_volume


# Result of getcomp(): seed keyword -> {mediator keyword: value}.
result_query_volume = getcomp()
result_query_volume


In [None]:
# Build w_midkeyword: seed keyword -> {mediator keyword: weight}, taken from
# the 'keyword' and 'weight' fields of the mediator keyword records.
w_midkeyword = {
    seed: {record['keyword']: record['weight'] for record in mid_word_dict[seed]}
    for seed in seedwords
}
w_midkeyword

In [None]:
def comp_result():
    """Combine the per-mediator scores into one score per competitor keyword.

    Implements Comp(k, s) = sum_i w_{a_i} * Comp_{a_i}(k, s): for every seed
    keyword, the i-th mediator keyword of ``result_query_volume[seed]``
    contributes ``w_midkeyword[seed][mediator] * result_query_volume[seed][mediator]``
    to the competitor keyword ``compkeywords[seed][i]``. A competitor keyword
    reached through several mediator keywords receives the sum of their
    contributions rather than only the last one.

    The elapsed time is stored in ``time_stats['comp_k_s_time']``.

    Returns:
        dict: seed keyword -> {competitor keyword: competitiveness}.

    Raises:
        IndexError: If ``compkeywords[seed]`` has fewer entries than there are
            mediator keywords for that seed keyword.
    """
    comp_k_s_start_time = time.time()
    result = {}
    for seedword, mid_scores in result_query_volume.items():
        scores = result.setdefault(seedword, {})
        # Mediator keywords and competitor keywords are paired by position.
        for i, (midkeyword, comp_a) in enumerate(mid_scores.items()):
            compword = compkeywords[seedword][i]
            contribution = w_midkeyword[seedword][midkeyword] * comp_a
            # Accumulate so that no mediator keyword's contribution is lost.
            scores[compword] = scores.get(compword, 0) + contribution
    comp_k_s_end_time = time.time()
    time_stats['comp_k_s_time'] = comp_k_s_end_time - comp_k_s_start_time
    return result


# Competitiveness of every competitor keyword k with respect to its seed keyword s.
comp_k_s = comp_result()

# Order the competitor keywords of each seed keyword by descending competitiveness.
for seedword in seedwords:
    comp_k_s[seedword] = dict(sorted(comp_k_s[seedword].items(), key=lambda x: x[1], reverse=True))
comp_k_s

In [None]:
# Collect, per seed keyword, the competitiveness of each of its competitor
# keywords, ranked from highest to lowest. Single-character keywords and the
# literal 'txt' are excluded from the result.
comp_result_dict = {}
for seedword in seedwords:
    scored = {compword: comp_k_s[seedword][compword] for compword in compkeywords[seedword]}
    ranked = sorted(scored.items(), key=lambda pair: pair[1], reverse=True)
    comp_result_dict[seedword] = {
        word: score for word, score in ranked if len(word) > 1 and word != 'txt'
    }

# # Keep only the first five competitor keywords of each seed keyword
# for seedword in seedwords:
#     comp_result_dict[seedword] = dict(list(comp_result_dict[seedword].items())[:5])
total_end_time = time.time()
time_stats['total_time'] = total_end_time - total_start_time

comp_result_dict

In [None]:
# Display the recorded stage durations (seconds).
time_stats

## 各时间占比扇形统计图

In [None]:
# import matplotlib.pyplot as plt
#
# # 时间统计数据
# time_stats = {
#     's_time': 1.796548843383789,
#     'sa_time': 1.326596975326538,
#     'wight_time': 0.0,
#     'ka_time': 33.054537296295166,
#     'a_time': 70.60850167274475,
#     'comp_time': 0.0,
#     'comp_k_s_time': 0.0,
#     'total_time': 400.392245054245
# }
#
# # 计算各部分占total_time的比例
# total_time = time_stats['total_time']
# data = {
#     key: value for key, value in time_stats.items() if value > 0 and key != 'total_time'
# }
#
# # 将未列出的部分归入其他时间
# other_time = total_time - sum(data.values())
# if other_time > 0:
#     data['other_time'] = other_time
#
# # 绘制扇形统计图
# labels = data.keys()
# sizes = data.values()
#
# # 设置比较浅的颜色
# colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#c2c2f0', '#ffb3e6']
#
#
# # 自定义百分比显示
# def func(pct):
#     return f'{pct:.1f}%' if pct > 1 else ''  # 设定阈值
#
#
# # 绘制饼图
# wedges, texts, autotexts = plt.pie(sizes, labels=None, autopct=lambda pct: func(pct),
#                                    startangle=140, colors=colors, pctdistance=0.85,
#                                    textprops=dict(color="black"))
#
# plt.axis('equal')  # 使饼图为圆形
# plt.title('The proportion of each part in the total time')
#
# # 添加图例并调整位置和字体大小
# plt.legend(labels, loc='upper left', title="Time statistics", title_fontsize='10', fontsize='9', bbox_to_anchor=(1, 1))
#
# plt.tight_layout()  # 自动调整布局以避免重叠
# plt.show()
