# CompKey 竞争性关键词推荐算法设计与实现

In [None]:
# Seed keywords that the rest of the notebook iterates over. Only the ten
# entries in the list are active; the commented-out line below holds further
# candidate seeds that are currently disabled.
seedwords = ['图片', '手机', '小说', '视频', '下载', '大全', 'qq', '电影', '中国', '世界']
# , '重生', '百度', '官网','txt', '英语', '电视剧', '游戏', '查询', '做法', '倾城']

## 确定竞争性关键词集合

In [None]:
def read_file_to_dict(file_path: str):
    """Parse a sectioned keyword file into a dict of record lists.

    A line ending in ':' starts a new section; the section key is the line
    without that trailing colon. Inside a section, every line made of exactly
    three whitespace-separated fields is read as ``keyword freq weight`` and
    stored as ``{'keyword': str, 'freq': int, 'weight': float}``. Lines with
    any other number of fields (including blank lines) are ignored.

    Data lines that appear before the first section header have no section
    to belong to and are skipped as well, instead of raising ``KeyError``.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        dict mapping each section key to the list of its parsed records, in
        file order. A header that appears twice restarts its list.

    Raises:
        ValueError: If ``freq`` or ``weight`` of a three-field line is not
            numeric.
    """
    data_dict = {}
    current_key = None
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if line.endswith(':'):
                current_key = line[:-1]
                data_dict[current_key] = []
                continue
            if current_key is None:
                # Orphan line ahead of any header: nowhere to store it.
                continue
            parts = line.split()
            if len(parts) != 3:
                continue
            keyword, freq, weight = parts
            data_dict[current_key].append({
                'keyword': keyword,
                'freq': int(freq),
                'weight': float(weight)
            })
    return data_dict


file_path = '../data/temp/seed_mid.train'
midkeys_dict = read_file_to_dict(file_path)

# For each seed keyword: keep only its own section, drop mediating keywords
# that contain the seed keyword as a substring, then keep the first 10 that
# remain.
selected = {}
for seedword in seedwords:
    survivors = [entry for entry in midkeys_dict[seedword] if seedword not in entry['keyword']]
    selected[seedword] = survivors[:10]
midkeys_dict = selected

midkeys_dict

In [None]:
# Select the queries that do not contain the seed keyword but do contain one
# of its mediating keywords, and store them in a per-seed file.
def comkey_words_file(seedword: str):
    """Write the candidate query lines for ``seedword`` to its own file.

    Reads '../data/temp/cleaned.train' line by line and copies a line to
    '../data/temp/compkey_<seedword>' when the line does not contain
    ``seedword`` and contains at least one mediating keyword of that seed
    (plain substring tests). The output file is overwritten.

    Args:
        seedword: Seed keyword; must be a key of the module-level
            ``midkeys_dict``.

    Raises:
        KeyError: If ``seedword`` is not in ``midkeys_dict`` (raised before
            any file is opened, so no output file is created or truncated).
    """
    filename = '../data/temp/compkey_' + seedword
    # Loop-invariant: the keyword strings are the same for every input line.
    midkeywords = [midkey['keyword'] for midkey in midkeys_dict[seedword]]
    # Context managers guarantee both handles are closed even on error.
    with open('../data/temp/cleaned.train', 'r', encoding='utf-8') as query_data, \
            open(filename, 'w', encoding='utf-8') as result_data:
        for line in query_data:
            if seedword in line:
                continue
            if any(midkeyword in line for midkeyword in midkeywords):
                result_data.write(line)


# Build the per-seed file for every seed keyword, echoing progress.
for seedword in seedwords:
    print(seedword)
    comkey_words_file(seedword)

In [None]:

# 1. Load the stop words (one per line, surrounding whitespace removed)
stopwords_file = '../data/stop_words/merge_stopwords.txt'  # path of the stop-word file
stopwords = set()
with open(stopwords_file, 'r', encoding='utf-8') as file:
    stopwords.update(entry.strip() for entry in file)
stopwords

In [None]:
from collections import Counter


def stop_words_filter(word_list):
    """Return the words of ``word_list`` that are worth counting.

    A word is kept when it is not in the module-level ``stopwords`` set and
    is not empty or whitespace-only. Order and duplicates are preserved.
    """
    return [word for word in word_list if word not in stopwords and word.strip()]


import jieba

# Module-level accumulators: get_compkey_words() appends one row per call.
ka_list = []
comp_list = []


def get_compkey_words(seedword: str):
    """Pick one competitive keyword per mediating keyword of ``seedword``.

    For every mediating keyword in ``midkeys_dict[seedword]`` the lines of
    '../data/temp/compkey_<seedword>' that contain it are tokenized with
    ``jieba.lcut``, passed through ``stop_words_filter`` and counted. Of the
    three most frequent tokens, those different from the mediating keyword
    are printed with their counts; the first of them is recorded as the
    competitive keyword and its count as the matching ``ka`` value.

    Side effects:
        Appends the list of counts to the module-level ``ka_list`` and the
        list of competitive keywords to ``comp_list``; prints one line per
        mediating keyword.

    NOTE(review): when no candidate token exists for a mediating keyword,
    nothing is recorded for it, so the appended rows can be shorter than the
    list of mediating keywords. Code that pairs the rows with mediating
    keywords by position should confirm the lengths match.
    """
    ka_list_line = []
    comp_list_line = []
    compkey_path = '../data/temp/compkey_' + seedword
    for midkeyword in midkeys_dict[seedword]:
        mid = midkeyword['keyword']
        # Tokenize every stored query that contains the mediating keyword.
        compkey_query_list = []
        with open(compkey_path, 'r', encoding='utf-8') as comkey_data:
            for line in comkey_data:
                if mid in line:
                    compkey_query_list.extend(jieba.lcut(line.strip()))
        count_result = Counter(stop_words_filter(compkey_query_list))
        # Top-three tokens, minus the mediating keyword itself.
        candidates = [(key, val) for key, val in count_result.most_common(3) if key != mid]
        if candidates:
            # Only the most frequent candidate is recorded.
            best_key, best_val = candidates[0]
            ka_list_line.append(best_val)
            comp_list_line.append(best_key)
        for key, val in candidates:
            print(key, val, end='  ')
        print()
    ka_list.append(ka_list_line)
    comp_list.append(comp_list_line)



In [None]:
# Run the extraction for every seed keyword, printing a header before each.
for seedword in seedwords:
    print(f'\n{seedword}:\n')
    get_compkey_words(seedword)

In [None]:
ka_list

In [None]:
comp_list

In [None]:
# Competitive keyword selection.
# This also provides the |{ka}| values.
#
# NOTE: both names are rebound here to hard-coded literals, replacing whatever
# get_compkey_words() appended to the lists earlier in the session.
# Row i belongs to seedwords[i]; within a row, ka_list[i][j] and
# comp_list[i][j] are paired by position.
# NOTE(review): presumably these literals were copied from a previous run's
# output — confirm they still match the current data files.

ka_list = [[12907, 3002, 1183, 1495, 8434, 3059, 32687, 23197, 10919, 3193],
           [14343, 34432, 10393, 2825, 3805, 5292, 2824, 1021, 5205, 1174],
           [7135, 2128, 756, 3775, 2050, 22595, 3433, 3792, 33049, 46657],
           [832, 17397, 17301, 31527, 22807, 1038, 3586, 529, 34502, 42487],
           [2059, 18414, 5397, 20907, 6621, 14470, 4107, 8198, 35215, 2125],
           [5514, 4265, 1958, 5128, 5659, 4677, 1268, 8809, 158703, 11275],
           [21907, 4737, 2172, 16373, 2050, 159379, 898, 3271, 2213, 34496],
           [876, 1011, 18237, 7889, 22745, 18837, 31519, 5240, 47855, 48585],
           [28837, 456, 32473, 1420, 2220, 2688, 350, 5892, 5982, 5201],
           [533, 2608, 16189, 5719, 21029, 23765, 86, 1745, 34060, 1000]]

comp_list = [['做法', '适合', '2016', '句子', '价格表', '视频', '年', '版', '手机', '头像'],
             ['完整版', 'txt', '头像', '6s', '下载', '荣耀', '路由器', '高清', '教学', '官网'],
             ['火线', '电影', '电影', '2016', '耽', '全文', 'txt', '女主角', 'txt', '微微一笑'],
             ['设计', '观看', '在线', '图片', '版', '游戏', '图片', '教学', 'txt', '荣耀'],
             ['重生', '版', '电视剧', '手机', '手机', '阅读', '手机游戏', '穿越', '云', '官方网站'],
             ['发型', '家常', '好听', '教学', '韩国', '下载', '下载', '倾城', '小说', '手机'],
             ['版', '女生', '重生', '查询', '侠盗', '小说', '163', '背景音乐', '女生', 'txt'],
             ['中国', '视频', '在线', '下载', '版', '观看', '图片', '小说', '倾城', '微微一笑'],
             ['最新', '新', '年', '没', '百度', '大学排名', '人物', '网', '约', '教学'],
             ['魔兽争霸', '月', 'wifi', '异界', '版', '手机', '学', '幸福生活', 'txt', '告白']]

In [None]:
# Map each seed keyword to its row of competitive keywords; seedwords and
# comp_list are paired by position.
compkeywords = dict(zip(seedwords, comp_list))
compkeywords

In [None]:
# Extract just the 'keyword' field of every seed keyword's mediating keywords
# into a two-dimensional list (one row per seed, in seedwords order).
midkeywords_list = [
    [entry['keyword'] for entry in midkeys_dict[seed]]
    for seed in seedwords
]
midkeywords_list

In [None]:

# Competitive keyword for each mediating keyword:
# compwords[seed][mediating keyword] -> competitive keyword.
# Each seed starts with its own empty dict (dict.fromkeys(seedwords, {}) would
# make every key share a single dict object).
compwords = {seed: {} for seed in seedwords}
# The loop variable is deliberately not called `list`: binding that name at
# module level would hide the builtin for every later cell.
for i, midkeywords in enumerate(midkeywords_list):
    compwords[seedwords[i]] = dict(zip(midkeywords, comp_list[i]))
compwords

In [None]:
# Store the |{ka}| values in the dict ka_query_volume:
# ka_query_volume[seed][mediating keyword] -> |{ka}|.
# Each seed starts with its own empty dict (dict.fromkeys(seedwords, {}) would
# make every key share a single dict object).
ka_query_volume = {seed: {} for seed in seedwords}
# The loop variable is deliberately not called `list`: binding that name at
# module level would hide the builtin for every later cell.
for i, midkeywords in enumerate(midkeywords_list):
    ka_query_volume[seedwords[i]] = dict(zip(midkeywords, ka_list[i]))
ka_query_volume

In [None]:
# a_query_volume[seed][mediating keyword] -> |{a}|; filled by the loop at the
# end of this cell. Each seed starts with its own empty dict.
a_query_volume = {seed: {} for seed in seedwords}


# Compute the search volume |{a}| of every mediating keyword
def count_midkeyword(seedword):
    """Count the query lines that contain each mediating keyword of a seed.

    Scans '../data/temp/cleaned.train' once; a line counts for a mediating
    keyword when the keyword occurs in it as a substring.

    Args:
        seedword: Seed keyword; must be an element of ``seedwords``.

    Returns:
        dict mapping each mediating keyword of ``seedword`` (taken from
        ``midkeywords_list``) to the number of matching lines.

    Raises:
        ValueError: If ``seedword`` is not in ``seedwords``.
    """
    # Resolve the row once instead of calling seedwords.index() per line.
    midkeywords = midkeywords_list[seedwords.index(seedword)]
    count_dict = dict.fromkeys(midkeywords, 0)
    with open('../data/temp/cleaned.train', 'r', encoding='utf-8') as query_data:
        for sentence in query_data:
            for midkeyword in midkeywords:
                if midkeyword in sentence:
                    count_dict[midkeyword] += 1
    return count_dict


for seedword in seedwords:
    print(seedword)
    a_query_volume[seedword] = count_midkeyword(seedword)

In [None]:
a_query_volume

In [None]:
# Placeholder mapping seed keyword -> {}. dict.fromkeys reuses one dict object
# for every key; get_sa_query_volume() replaces each value with a fresh dict.
sa_query_volume = dict.fromkeys(seedwords, {})


# Pair each mediating keyword in midkeys_dict with its 'freq' value and store
# the pairs in sa_query_volume
def get_sa_query_volume():
    """Fill the module-level ``sa_query_volume`` in place and return it.

    For every seed keyword in ``seedwords`` the entry is replaced by a new
    dict mapping each mediating keyword's 'keyword' to its 'freq', as found
    in ``midkeys_dict``.
    """
    for seedword in seedwords:
        sa_query_volume[seedword] = {
            entry['keyword']: entry['freq'] for entry in midkeys_dict[seedword]
        }
    return sa_query_volume


# Populate sa_query_volume in place (the return value is not needed here),
# then display it.
get_sa_query_volume()

sa_query_volume




In [None]:

# 计算Comp
import copy


def getcomp():
    """Compute |{ka}| / (|{a}| - |{sa}|) for every seed / mediating keyword.

    Reads the module-level ``ka_query_volume``, ``a_query_volume`` and
    ``sa_query_volume`` (each ``seed -> {mediating keyword -> count}``).

    Returns:
        A deep copy of ``sa_query_volume`` in which every (seed, mediating
        keyword) pair present in ``a_query_volume`` is replaced by the ratio
        above. Pairs absent from ``a_query_volume`` keep their copied value.
        ``sa_query_volume`` itself is not modified.

    When ``|{a}| == |{sa}|`` the denominator is zero; the ratio is then
    reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    comp_query_volume = copy.deepcopy(sa_query_volume)
    for seedword, a_counts in a_query_volume.items():
        for midkeyword, a_count in a_counts.items():
            ka_count = ka_query_volume[seedword][midkeyword]
            denominator = a_count - sa_query_volume[seedword][midkeyword]
            if denominator == 0:
                comp_query_volume[seedword][midkeyword] = 0.0
            else:
                comp_query_volume[seedword][midkeyword] = ka_count / denominator
    return comp_query_volume


# Keep the per-seed, per-mediating-keyword values returned by getcomp() and
# display them.
result_query_volume = getcomp()
result_query_volume


In [None]:
# Build w_midkeyword: seed keyword -> {mediating keyword -> 'weight'}
w_midkeyword = {}
for seedword in seedwords:
    w_midkeyword[seedword] = {
        entry['keyword']: entry['weight'] for entry in midkeys_dict[seedword]
    }
w_midkeyword

In [None]:
def comp_result():
    """Combine weights and Comp values into one score per competitive keyword.

    For every seed keyword in ``result_query_volume``, the i-th mediating
    keyword is paired with ``compkeywords[seed][i]`` and contributes
    ``w_midkeyword[seed][mediating keyword] * result_query_volume[seed][mediating keyword]``
    to that competitive keyword's score.

    Several mediating keywords can point at the same competitive keyword;
    their contributions are summed rather than letting the last one
    overwrite the earlier ones.

    Returns:
        dict ``seed -> {competitive keyword -> score}``.

    Raises:
        IndexError: If a seed has more mediating keywords than entries in
            ``compkeywords[seed]``.
    """
    result = {}
    for seedword, comp_by_midkeyword in result_query_volume.items():
        scores = result.setdefault(seedword, {})
        for i, (midkeyword, comp) in enumerate(comp_by_midkeyword.items()):
            compword = compkeywords[seedword][i]
            weighted = w_midkeyword[seedword][midkeyword] * comp
            scores[compword] = scores.get(compword, 0) + weighted
    return result


# Competitiveness of each keyword k with respect to seed keyword s
comp_k_s = comp_result()

# Order every seed keyword's competitive keywords by descending score
for seedword in seedwords:
    ranked = sorted(comp_k_s[seedword].items(), key=lambda pair: pair[1], reverse=True)
    comp_k_s[seedword] = dict(ranked)
comp_k_s

In [None]:
import pandas as pd


In [None]:
# After ranking, collect each seed keyword's competitive keywords and their
# scores into one dict.
# NOTE(review): entries are inserted in compkeywords order, so the resulting
# dict does not follow the ranking held by comp_k_s — confirm this is intended.
comp_result_dict = {}
for seedword in seedwords:
    scores = comp_k_s[seedword]
    comp_result_dict[seedword] = {
        compword: scores[compword] for compword in compkeywords[seedword]
    }

comp_result_dict