# CompKey

- |{s}| 种子关键词的搜索量
- |{sa}| 联合查询搜索量，即种子关键词 s 和对应的中介关键词 a 同时出现的查询的搜索量
- 中介关键词的权重：$$w_a(k)=\frac{|\{sa\}|}{|\{s\}|}$$ 
- |{ka}| 竞争性关键词的词频（不含有种子关键词，但有中介关键词的搜索）
- |{a}| 含有中介关键词 a 的所有查询的搜索量
- 竞争性 Comp 测度的计算公式：$$Comp_a(k,s)=\frac{|\{ka\}|}{|\{a\}|-|\{sa\}|}$$
- 关键词 k 与种子关键词 s 的竞争性程度：$$Comp(k,s)=\sum_{i=1}^{m}{\{w_{a_i}(k)\times Comp_{a_i}(k,s)\}}$$

In [None]:
import jieba

# Path of the input file that holds the search records of the seed keywords.
seed_words_query_file_path = '../data/processed/seed_words_query.train'

# Seed keywords in their raw "<keyword>::" form.
dirty_seedwords = [
    "图片::", "手机::", "小说::", "视频::", "下载::", "大全::", "qq::", "电影::", "中国::", "世界::",
    # Currently disabled seed keywords:
    # "重生::", "百度::", "官网::", "txt::", "英语::", "电视剧::", "游戏::", "查询::", "做法::", "倾城::"
]

# Register the custom dictionary with jieba.
jieba.load_userdict('../data/dictionary')

# Seed keywords with the trailing two-character "::" marker removed.
seedwords = [dirty[:-2] for dirty in dirty_seedwords]

In [None]:
# 1. Load the stop words (one entry per line) into a set.
stopwords_file = '../data/stop_words/merge_stopwords.txt'  # path of the stop-word file
with open(stopwords_file, 'r', encoding='utf-8') as stopwords_handle:
    # Strip surrounding whitespace (including the newline) from every entry.
    stopwords = {raw_entry.strip() for raw_entry in stopwords_handle}

In [None]:
# Number of search records found under each seed keyword (the value |{s}|).
s_query_volume = dict.fromkeys(seedwords, 0)

# The input file is read sequentially: a line equal to an entry of
# dirty_seedwords ("<seed>::") switches the current seed keyword, and every
# other line is counted as one search record of that seed keyword.
with open(seed_words_query_file_path, 'r', encoding='utf-8') as input_data:
    current_seed_keyword = None
    for record in input_data:
        record = record.strip()
        if record in dirty_seedwords:
            # Header line: remember the seed keyword without its "::" suffix.
            current_seed_keyword = record[:-2]
            continue
        # Ordinary line: one more search record for the current seed keyword.
        s_query_volume[current_seed_keyword] += 1

s_query_volume

In [None]:
# Per seed keyword, statistics of the words that co-occur with it in its
# search records:
#   mid_word_dict[seed][word] = {'freq': co-occurrence count (sa),
#                                'weight': freq / s_query_volume[seed]}
mid_word_dict = {keyword: {} for keyword in seedwords}

# Tokenise every search record of the seed-keyword file and count the
# co-occurring words (sa).
with open(seed_words_query_file_path, 'r', encoding='utf-8') as train_data:
    # Seed keyword whose section of the file is currently being read.
    current_seed_word = None
    for line in train_data:
        line = line.strip()
        # A line equal to "<seed>::" starts the section of a new seed keyword.
        if line in dirty_seedwords:
            current_seed_word = line[:-2]  # drop the trailing "::"
            continue
        for seg in jieba.lcut(line):
            # Skip empty tokens, the seed keyword itself and stop words.
            if seg == '' or seg == current_seed_word or seg in stopwords:
                continue
            # Count every occurrence, including the first one. Previously the
            # first occurrence only created the entry with freq 0, so all
            # frequencies (and the weights derived from them) were one too low.
            # NOTE: this counts token occurrences, so a word repeated inside a
            # single record is counted once per repetition.
            word_stats = mid_word_dict[current_seed_word].setdefault(seg, {'freq': 0})
            word_stats['freq'] += 1

# Keep only the 20 most frequent co-occurring words of every seed keyword as
# its mediating keywords.
for keyword in mid_word_dict:
    sorted_mid_words = sorted(mid_word_dict[keyword].items(), key=lambda x: x[1]['freq'], reverse=True)
    mid_word_dict[keyword] = dict(sorted_mid_words[:20])

# Weight of a mediating keyword: w_a = |{sa}| / |{s}|.
for keyword, mid_words in mid_word_dict.items():
    for word_stats in mid_words.values():
        word_stats['weight'] = word_stats['freq'] / s_query_volume[keyword]

mid_word_dict

In [None]:
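# Cached mediating-keyword statistics (20 entries per seed keyword).
# Assigning this literal replaces any mid_word_dict computed earlier, so the
# expensive counting step does not have to be re-run. It also contains seed
# keywords that are currently commented out of dirty_seedwords.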
mid_word_dict = {'图片': {'大全': {'freq': 32222, 'weight': 0.10697164862890911},
                          '发型': {'freq': 5682, 'weight': 0.018863289290219774},
                          '报价': {'freq': 4263, 'weight': 0.01415244671668548},
                          '唯美': {'freq': 4169, 'weight': 0.013840382444724787},
                          '价格': {'freq': 4012, 'weight': 0.01331916871389682},
                          '搞笑': {'freq': 3683, 'weight': 0.012226943762034393},
                          '2016': {'freq': 3675, 'weight': 0.01220038510059093},
                          '手机': {'freq': 3324, 'weight': 0.01103512382975898},
                          'qq': {'freq': 3304, 'weight': 0.010968727176150323},
                          '动漫': {'freq': 3144, 'weight': 0.010437553947281058},
                          '背景图片': {'freq': 2823, 'weight': 0.009371887656862095},
                          '动态': {'freq': 2698, 'weight': 0.00895690857180798},
                          '简笔画': {'freq': 2646, 'weight': 0.00878427727242547},
                          '手抄报': {'freq': 2583, 'weight': 0.008575127813558197},
                          '带字': {'freq': 2404, 'weight': 0.007980877763760707},
                          '文字': {'freq': 2327, 'weight': 0.007725250647367373},
                          '可爱': {'freq': 2324, 'weight': 0.007715291149326074},
                          '女生': {'freq': 2283, 'weight': 0.007579178009428325},
                          '头发': {'freq': 2255, 'weight': 0.007486222694376203},
                          '微信': {'freq': 2246, 'weight': 0.007456344200252307}},
                 '手机': {'手机版': {'freq': 23463, 'weight': 0.11010943835410722},
                          '下载': {'freq': 11997, 'weight': 0.05630068328577865},
                          'qq': {'freq': 11858, 'weight': 0.055648370626220155},
                          '苹果': {'freq': 10206, 'weight': 0.04789570506081994},
                          '软件': {'freq': 7256, 'weight': 0.03405165940831957},
                          '手机号': {'freq': 5103, 'weight': 0.02394785253040997},
                          '华为': {'freq': 5055, 'weight': 0.02372259348250488},
                          '手机游戏': {'freq': 4335, 'weight': 0.020343707763928517},
                          '壁纸': {'freq': 4225, 'weight': 0.019827489112479352},
                          '设置': {'freq': 4166, 'weight': 0.019550608199429345},
                          '视频': {'freq': 3811, 'weight': 0.01788462982429794},
                          '小米': {'freq': 3427, 'weight': 0.016082557441057216},
                          '三星': {'freq': 3364, 'weight': 0.015786904940681783},
                          '图片': {'freq': 3150, 'weight': 0.014782625018771587},
                          '手机号码': {'freq': 3143, 'weight': 0.014749774740952095},
                          '世界': {'freq': 3130, 'weight': 0.014688767082144467},
                          '手机卡': {'freq': 3111, 'weight': 0.0145996020423487},
                          '密码': {'freq': 3034, 'weight': 0.014238248986334284},
                          '在线': {'freq': 2831, 'weight': 0.013285590929569005},
                          '游戏': {'freq': 2830, 'weight': 0.013280898032737649}},
                 '小说': {'穿越': {'freq': 8148, 'weight': 0.0456560109825456},
                          '好看': {'freq': 6076, 'weight': 0.03404589135124534},
                          '类似': {'freq': 5815, 'weight': 0.03258341971815202},
                          '排行榜': {'freq': 5232, 'weight': 0.029316672736951224},
                          '耽美': {'freq': 5214, 'weight': 0.029215812624324097},
                          '阅读': {'freq': 4713, 'weight': 0.026408539489535764},
                          '重生': {'freq': 4606, 'weight': 0.025808982153363403},
                          '主角': {'freq': 4566, 'weight': 0.02558484856974757},
                          '下载': {'freq': 4476, 'weight': 0.02508054800661194},
                          '小说网': {'freq': 4409, 'weight': 0.024705124254055418},
                          '倾城': {'freq': 4122, 'weight': 0.0230969657916118},
                          '女主': {'freq': 3927, 'weight': 0.022004314571484603},
                          '微微一笑': {'freq': 3905, 'weight': 0.021881041100495897},
                          '免费': {'freq': 3852, 'weight': 0.021584064102204915},
                          '都市': {'freq': 3787, 'weight': 0.021219847028829182},
                          '完结': {'freq': 3601, 'weight': 0.02017762586501555},
                          '同人小说': {'freq': 3279, 'weight': 0.01837335051690808},
                          '玄幻': {'freq': 2652, 'weight': 0.014860056593729864},
                          '言情小说': {'freq': 2645, 'weight': 0.014820833216597091},
                          'txt': {'freq': 2464, 'weight': 0.013806628750735439}},
                 '视频': {'教学': {'freq': 5210, 'weight': 0.031226213237277266},
                          '在线': {'freq': 4942, 'weight': 0.029619951212787765},
                          '观看': {'freq': 4913, 'weight': 0.02944613927730196},
                          '大全': {'freq': 4548, 'weight': 0.027258506296187524},
                          '手机': {'freq': 3866, 'weight': 0.023170929054762746},
                          '腾讯': {'freq': 3301, 'weight': 0.019784593070297937},
                          '在线视频': {'freq': 3282, 'weight': 0.019670716284979652},
                          '搞笑': {'freq': 3091, 'weight': 0.018525954916780045},
                          '舞蹈': {'freq': 3025, 'weight': 0.01813038292567442},
                          '下载': {'freq': 2997, 'weight': 0.017962564505205367},
                          '视频教程': {'freq': 2947, 'weight': 0.017662888754367775},
                          '2016': {'freq': 2588, 'weight': 0.015511216863353851},
                          '直播': {'freq': 2423, 'weight': 0.014522286885589791},
                          '青青草': {'freq': 2407, 'weight': 0.014426390645321762},
                          '王者荣耀': {'freq': 2385, 'weight': 0.014294533314953221},
                          '吻戏': {'freq': 2356, 'weight': 0.014120721379467417},
                          '超碰': {'freq': 2159, 'weight': 0.012939998921167296},
                          '微信': {'freq': 2146, 'weight': 0.012862083225949523},
                          '马蓉': {'freq': 2006, 'weight': 0.01202299112360426},
                          '播放': {'freq': 1920, 'weight': 0.0115075488321636}},
                 '下载': {'txt': {'freq': 34758, 'weight': 0.2546149789029536},
                          '全集': {'freq': 7497, 'weight': 0.05491824894514768},
                          '手机': {'freq': 7153, 'weight': 0.05239832395686826},
                          '手机版': {'freq': 4783, 'weight': 0.03503721284575715},
                          '软件': {'freq': 4530, 'weight': 0.033183895921237695},
                          '免费': {'freq': 4309, 'weight': 0.031564990623534926},
                          '游戏': {'freq': 4084, 'weight': 0.029916783872480077},
                          '小说': {'freq': 3820, 'weight': 0.02798288795124238},
                          'qq': {'freq': 3254, 'weight': 0.023836732301922174},
                          '官方': {'freq': 3039, 'weight': 0.02226177918424754},
                          '视频': {'freq': 2999, 'weight': 0.021968764650726676},
                          'app': {'freq': 2861, 'weight': 0.0209578645100797},
                          '迅雷': {'freq': 2378, 'weight': 0.017419714017815283},
                          '下载安装': {'freq': 2306, 'weight': 0.01689228785747773},
                          '百度云': {'freq': 2280, 'weight': 0.01670182841068917},
                          '世界': {'freq': 2191, 'weight': 0.01604987107360525},
                          '重生': {'freq': 2027, 'weight': 0.014848511486169714},
                          '版': {'freq': 1936, 'weight': 0.014181903422409752},
                          '网': {'freq': 1845, 'weight': 0.013515295358649789},
                          '优酷': {'freq': 1794, 'weight': 0.01314170182841069}},
                 '大全': {'图片': {'freq': 31363, 'weight': 0.2300993382342152},
                          '做法': {'freq': 13029, 'weight': 0.09558920632125721},
                          '名字': {'freq': 5491, 'weight': 0.04028554239849745},
                          '视频': {'freq': 5098, 'weight': 0.03740223914542706},
                          '电影': {'freq': 3525, 'weight': 0.02586168948364661},
                          '游戏': {'freq': 3279, 'weight': 0.024056873706915526},
                          '歌曲': {'freq': 3112, 'weight': 0.022831653240598083},
                          'qq': {'freq': 2982, 'weight': 0.021877888805740195},
                          '电视剧': {'freq': 2980, 'weight': 0.021863215506742383},
                          '说': {'freq': 2960, 'weight': 0.021716482516764245},
                          '2016': {'freq': 2587, 'weight': 0.018979912253671995},
                          '全集': {'freq': 2422, 'weight': 0.017769365086352365},
                          '图案': {'freq': 2305, 'weight': 0.016910977094980264},
                          '字': {'freq': 2151, 'weight': 0.015781133072148613},
                          '搞笑': {'freq': 1961, 'weight': 0.014387169667356313},
                          '旅游景点': {'freq': 1833, 'weight': 0.013448078531496236},
                          '成语': {'freq': 1826, 'weight': 0.013396721985003888},
                          '漫画': {'freq': 1796, 'weight': 0.013176622500036683},
                          '手抄报': {'freq': 1604, 'weight': 0.01176798579624657},
                          '图解': {'freq': 1583, 'weight': 0.011613916156769526}},
                 'qq': {'头像': {'freq': 10564, 'weight': 0.1033942763183651},
                        '手机': {'freq': 10057, 'weight': 0.09843205574912892},
                        '空间': {'freq': 6802, 'weight': 0.06657401244959481},
                        '号': {'freq': 4061, 'weight': 0.03974670164037114},
                        '飞车': {'freq': 3940, 'weight': 0.038562424147515956},
                        '说': {'freq': 3865, 'weight': 0.037828367850291665},
                        '邮箱': {'freq': 3711, 'weight': 0.03632110558665779},
                        '音乐': {'freq': 3221, 'weight': 0.031525271111459106},
                        '网名': {'freq': 3126, 'weight': 0.030595466468308342},
                        '下载': {'freq': 3108, 'weight': 0.030419292956974514},
                        '好友': {'freq': 3062, 'weight': 0.029969071761343617},
                        '群': {'freq': 2996, 'weight': 0.02932310221978624},
                        '大全': {'freq': 2939, 'weight': 0.028765219433895782},
                        '/': {'freq': 2801, 'weight': 0.027414555847003093},
                        '女生': {'freq': 2642, 'weight': 0.025858356496887603},
                        '男生': {'freq': 2488, 'weight': 0.02435109423325373},
                        '图片': {'freq': 2163, 'weight': 0.02117018361194848},
                        '密码': {'freq': 2044, 'weight': 0.020005480953685942},
                        '浏览器': {'freq': 2028, 'weight': 0.01984888227694476},
                        '中心': {'freq': 1988, 'weight': 0.019457385585091806}},
                 '电影': {'韩国': {'freq': 5688, 'weight': 0.05157265778712678},
                          '电影网': {'freq': 4368, 'weight': 0.03960431948209736},
                          '完整版': {'freq': 4209, 'weight': 0.038162678731718816},
                          '观看': {'freq': 3825, 'weight': 0.03468098031571026},
                          '全集': {'freq': 3741, 'weight': 0.033919358787208385},
                          '在线': {'freq': 3576, 'weight': 0.03242331649907971},
                          '大全': {'freq': 3573, 'weight': 0.03239611573020464},
                          '手机': {'freq': 3064, 'weight': 0.027781051944401627},
                          '电影院': {'freq': 2616, 'weight': 0.02371907045905831},
                          '好看': {'freq': 2391, 'weight': 0.021679012793428293},
                          '微微一笑': {'freq': 2056, 'weight': 0.018641593602379162},
                          '倾城': {'freq': 2031, 'weight': 0.01841492052842027},
                          '天堂': {'freq': 1796, 'weight': 0.016284193633206698},
                          '神马': {'freq': 1730, 'weight': 0.01568577671795523},
                          '2016': {'freq': 1642, 'weight': 0.014887887497619932},
                          '下载': {'freq': 1511, 'weight': 0.013700120590075346},
                          '最新': {'freq': 1319, 'weight': 0.011959271382071066},
                          '日本': {'freq': 1318, 'weight': 0.01195020445911271},
                          '爱情': {'freq': 1296, 'weight': 0.011750732154028887},
                          '美国': {'freq': 1285, 'weight': 0.011650996001486976}},
                 '中国': {'中国女排': {'freq': 5533, 'weight': 0.04503463263362662},
                          '新': {'freq': 4860, 'weight': 0.039556897632283636},
                          '歌声': {'freq': 4555, 'weight': 0.03707441743108065},
                          '中国移动': {'freq': 4205, 'weight': 0.034225669659208376},
                          '2016': {'freq': 3455, 'weight': 0.02812121014805349},
                          '声音': {'freq': 2799, 'weight': 0.02278184289563002},
                          '地图': {'freq': 2723, 'weight': 0.022163257665166327},
                          '中国银行': {'freq': 2589, 'weight': 0.021072594232506653},
                          '排名': {'freq': 2351, 'weight': 0.019135445747633505},
                          '关系': {'freq': 2240, 'weight': 0.01823198573998258},
                          '中国式': {'freq': 2085, 'weight': 0.016970397441010573},
                          '官网': {'freq': 2053, 'weight': 0.016709940501867963},
                          '奥运会': {'freq': 1907, 'weight': 0.015521605717029815},
                          '视频': {'freq': 1796, 'weight': 0.014618145709378891},
                          '营业厅': {'freq': 1712, 'weight': 0.013934446244129544},
                          '图片': {'freq': 1657, 'weight': 0.013486785879978187},
                          '里': {'freq': 1628, 'weight': 0.013250746778880197},
                          '年': {'freq': 1534, 'weight': 0.012485654520148786},
                          '约': {'freq': 1528, 'weight': 0.012436818844059547},
                          '十大': {'freq': 1525, 'weight': 0.012412401006014928}},
                 '世界': {'魔兽': {'freq': 4918, 'weight': 0.051067453065293235},
                          '两个': {'freq': 4040, 'weight': 0.04195049011463698},
                          '异': {'freq': 3214, 'weight': 0.03337348396743645},
                          '从零开始': {'freq': 2778, 'weight': 0.028846153846153848},
                          '生活': {'freq': 2697, 'weight': 0.02800506728692474},
                          '手机版': {'freq': 2468, 'weight': 0.025627180594783187},
                          '下载': {'freq': 2235, 'weight': 0.02320775876391427},
                          '全世界': {'freq': 2179, 'weight': 0.022626266821731186},
                          '完美': {'freq': 2176, 'weight': 0.022595115467685663},
                          '暖': {'freq': 2171, 'weight': 0.02254319654427646},
                          '世界杯': {'freq': 2094, 'weight': 0.021743645123774715},
                          '盒子': {'freq': 1961, 'weight': 0.020362601761089883},
                          '做': {'freq': 1709, 'weight': 0.01774588802126599},
                          '解说': {'freq': 1657, 'weight': 0.017205931217810267},
                          '电影': {'freq': 1489, 'weight': 0.015461455391261007},
                          '排名': {'freq': 1467, 'weight': 0.015233012128260509},
                          '玩': {'freq': 1459, 'weight': 0.015149941850805782},
                          '视频': {'freq': 1413, 'weight': 0.014672287755441103},
                          '武侠': {'freq': 1352, 'weight': 0.014038876889848811},
                          '第一': {'freq': 1312, 'weight': 0.013623525502575179}},
                 '重生': {'小说': {'freq': 4657, 'weight': 0.06292222882775766},
                          'txt': {'freq': 3421, 'weight': 0.046222234232286656},
                          '末世': {'freq': 2340, 'weight': 0.03161649462249365},
                          '空间': {'freq': 2175, 'weight': 0.0293871264119332},
                          '下载': {'freq': 2038, 'weight': 0.027536075231043615},
                          '豪门': {'freq': 2005, 'weight': 0.027090201588931526},
                          '军嫂': {'freq': 1724, 'weight': 0.02329351996973464},
                          '文': {'freq': 1653, 'weight': 0.02233421607306923},
                          '女主': {'freq': 1607, 'weight': 0.021712695238609955},
                          '千金': {'freq': 1289, 'weight': 0.017416094687347998},
                          '女': {'freq': 1204, 'weight': 0.016267632275847158},
                          '耽美': {'freq': 1099, 'weight': 0.01484894341458142},
                          '系统': {'freq': 1029, 'weight': 0.01390315084040426},
                          '娱乐圈': {'freq': 989, 'weight': 0.013362697940874453},
                          '重': {'freq': 988, 'weight': 0.013349186618386208},
                          '巨星': {'freq': 983, 'weight': 0.013281630005944981},
                          '书包': {'freq': 933, 'weight': 0.012606063881532724},
                          '网': {'freq': 900, 'weight': 0.012160190239420634},
                          '嫡女': {'freq': 869, 'weight': 0.011741339242285034},
                          '军婚': {'freq': 844, 'weight': 0.011403556180078905}},
                 '百度': {'百度云': {'freq': 38820, 'weight': 0.5601085011831246},
                          '资源': {'freq': 6591, 'weight': 0.0950972470710452},
                          '贴吧': {'freq': 4222, 'weight': 0.060916488717031224},
                          '下载': {'freq': 3189, 'weight': 0.04601200438621804},
                          '百科': {'freq': 2520, 'weight': 0.036359439025797885},
                          '分享': {'freq': 2356, 'weight': 0.033993189819357075},
                          '云盘': {'freq': 2330, 'weight': 0.03361805275004329},
                          '地图': {'freq': 2263, 'weight': 0.03265135337911929},
                          '翻译': {'freq': 1851, 'weight': 0.026706873665377735},
                          '链接': {'freq': 1557, 'weight': 0.02246493911236798},
                          '电影': {'freq': 1372, 'weight': 0.019795694580712182},
                          '视频': {'freq': 1356, 'weight': 0.019564840999596007},
                          '倾城': {'freq': 1234, 'weight': 0.017804582443585158},
                          '百度网盘': {'freq': 1222, 'weight': 0.017631442257748025},
                          '手机': {'freq': 1199, 'weight': 0.017299590234893518},
                          '微微一笑': {'freq': 1101, 'weight': 0.015885612050556934},
                          'txt': {'freq': 976, 'weight': 0.014082068448086802},
                          '青云志': {'freq': 939, 'weight': 0.013548219541755642},
                          '老九门': {'freq': 902, 'weight': 0.013014370635424481},
                          '本子': {'freq': 819, 'weight': 0.011816817683384313}},
                 '官网': {'学院': {'freq': 3239, 'weight': 0.04454989340485524},
                          '手游': {'freq': 2047, 'weight': 0.02815487242968159},
                          '手机': {'freq': 1827, 'weight': 0.02512894573963276},
                          '苹果': {'freq': 1805, 'weight': 0.02482635307062788},
                          '职业': {'freq': 1728, 'weight': 0.023767278729110792},
                          '首页': {'freq': 1579, 'weight': 0.021717901107214083},
                          'lol': {'freq': 1488, 'weight': 0.02046626779451207},
                          '网': {'freq': 1386, 'weight': 0.019063338147307613},
                          '12306': {'freq': 1301, 'weight': 0.017894230107970567},
                          '小米': {'freq': 1278, 'weight': 0.01757788322673819},
                          '技术': {'freq': 1201, 'weight': 0.0165188088852211},
                          '中国': {'freq': 1193, 'weight': 0.01640877518740114},
                          '下载': {'freq': 1090, 'weight': 0.01499209132796919},
                          '查询': {'freq': 1070, 'weight': 0.014717007083419296},
                          '网上': {'freq': 973, 'weight': 0.013382848497352315},
                          '大学': {'freq': 934, 'weight': 0.012846434220480022},
                          'cf': {'freq': 849, 'weight': 0.011677326181142976},
                          '商城': {'freq': 845, 'weight': 0.011622309332232997},
                          '华为': {'freq': 806, 'weight': 0.011085895055360704},
                          'qq': {'freq': 744, 'weight': 0.010233133897256035}},
                 'txt': {'下载': {'freq': 35192, 'weight': 0.5323732300617209},
                         '全集': {'freq': 6765, 'weight': 0.10233873895679535},
                         '重生': {'freq': 3379, 'weight': 0.0511164226068014},
                         '小说': {'freq': 2100, 'weight': 0.031768122957763524},
                         '微盘': {'freq': 1619, 'weight': 0.024491710032675782},
                         '免费': {'freq': 1367, 'weight': 0.02067953527774416},
                         '网': {'freq': 1343, 'weight': 0.020316471015369722},
                         '系统': {'freq': 1334, 'weight': 0.020180321916979306},
                         '书包': {'freq': 1307, 'weight': 0.01977187462180806},
                         '都市': {'freq': 1046, 'weight': 0.015823550768486024},
                         '超级': {'freq': 950, 'weight': 0.014371293718988262},
                         '穿': {'freq': 898, 'weight': 0.01358465448384364},
                         '百度云': {'freq': 891, 'weight': 0.013478760740651095},
                         '快': {'freq': 879, 'weight': 0.013297228609463875},
                         '倾城': {'freq': 865, 'weight': 0.013085441123078785},
                         '穿越': {'freq': 790, 'weight': 0.01195086530315866},
                         '全文': {'freq': 742, 'weight': 0.011224736778409778},
                         '网游': {'freq': 707, 'weight': 0.010695268062447054},
                         '极品': {'freq': 688, 'weight': 0.010407842188067288},
                         '世界': {'freq': 653, 'weight': 0.009878373472104563}},
                 '英语': {'说': {'freq': 12635, 'weight': 0.16068930433676715},
                          '上册': {'freq': 3586, 'weight': 0.04560600279791428},
                          '作文': {'freq': 3367, 'weight': 0.04282080630802493},
                          '答案': {'freq': 3159, 'weight': 0.0401755055322396},
                          '英语单词': {'freq': 3034, 'weight': 0.038585781508330155},
                          '四级': {'freq': 2996, 'weight': 0.03810250540506168},
                          '英语翻译': {'freq': 2995, 'weight': 0.03808978761287041},
                          '大学': {'freq': 2984, 'weight': 0.03794989189876637},
                          '写': {'freq': 2460, 'weight': 0.03128576879053796},
                          '翻译': {'freq': 2250, 'weight': 0.02861503243037009},
                          '级': {'freq': 1970, 'weight': 0.02505405061681292},
                          '2016': {'freq': 1724, 'weight': 0.021925473737759126},
                          '教程': {'freq': 1291, 'weight': 0.016418669718936792},
                          '九年': {'freq': 1274, 'weight': 0.01620246725168511},
                          '人教版': {'freq': 1228, 'weight': 0.01561744881088643},
                          '八年级': {'freq': 1209, 'weight': 0.015375810759252194},
                          '课文': {'freq': 1174, 'weight': 0.014930688032557548},
                          '考研': {'freq': 1166, 'weight': 0.014828945695027343},
                          '考试': {'freq': 1136, 'weight': 0.014447411929289075},
                          '必修': {'freq': 1116, 'weight': 0.014193056085463564}},
                 '电视剧': {'倾城': {'freq': 8794, 'weight': 0.13913456213907127},
                            '微微一笑': {'freq': 8712, 'weight': 0.1378371964243335},
                            '全集': {'freq': 5398, 'weight': 0.08540463570920022},
                            '大全': {'freq': 2894, 'weight': 0.045787516810378924},
                            '演过': {'freq': 2770, 'weight': 0.04382564670516573},
                            '主演': {'freq': 1722, 'weight': 0.02724468000949292},
                            '好看': {'freq': 1585, 'weight': 0.025077129973894472},
                            '最新': {'freq': 1143, 'weight': 0.018084012340795822},
                            '老九门': {'freq': 1115, 'weight': 0.0176410094138122},
                            '2016': {'freq': 1102, 'weight': 0.017435329483426944},
                            '演员表': {'freq': 869, 'weight': 0.013748912269598923},
                            '古装': {'freq': 832, 'weight': 0.013163515544656277},
                            '韩国': {'freq': 813, 'weight': 0.012862906415631675},
                            '麻雀': {'freq': 755, 'weight': 0.01194525749545131},
                            '观看': {'freq': 726, 'weight': 0.011486433035361126},
                            '微微': {'freq': 652, 'weight': 0.010315639585475832},
                            '爱情': {'freq': 630, 'weight': 0.009967565857131556},
                            '演': {'freq': 629, 'weight': 0.009951744324024998},
                            '播': {'freq': 614, 'weight': 0.009714421327426628},
                            '在线': {'freq': 609, 'weight': 0.009635313661893838}},
                 '游戏': {'下载': {'freq': 4996, 'weight': 0.06077119571828245},
                          '手机游戏': {'freq': 4335, 'weight': 0.05273081133682034},
                          '大全': {'freq': 4218, 'weight': 0.051307626809390584},
                          '小游戏': {'freq': 4043, 'weight': 0.04917893200340591},
                          '单机游戏': {'freq': 3317, 'weight': 0.04034788955114949},
                          '手机': {'freq': 3285, 'weight': 0.0399586425009123},
                          '玩': {'freq': 3183, 'weight': 0.03871791752828123},
                          '玩游戏': {'freq': 3116, 'weight': 0.0379029315168471},
                          '好玩': {'freq': 2417, 'weight': 0.029400316263228317},
                          '名字': {'freq': 2304, 'weight': 0.028025787617078215},
                          '橙光': {'freq': 1444, 'weight': 0.017564773141953535},
                          '电脑': {'freq': 1354, 'weight': 0.016470015813161414},
                          '游戏王': {'freq': 1350, 'weight': 0.016421359931881766},
                          '什么游戏': {'freq': 1349, 'weight': 0.016409195961561852},
                          '4399': {'freq': 1323, 'weight': 0.01609293273324413},
                          '破解版': {'freq': 1205, 'weight': 0.014657584235494465},
                          '中心': {'freq': 1199, 'weight': 0.014584600413574992},
                          '腾讯': {'freq': 1093, 'weight': 0.013295219559664274},
                          '类似': {'freq': 1071, 'weight': 0.0130276122126262},
                          '排行榜': {'freq': 1016, 'weight': 0.012358593845031018}},
                 '查询': {'快递单号': {'freq': 9257, 'weight': 0.15198830985452993},
                          '快递': {'freq': 3556, 'weight': 0.0583850523757922},
                          '成绩': {'freq': 3191, 'weight': 0.052392210948018256},
                          '号': {'freq': 2777, 'weight': 0.04559485108199521},
                          '违章': {'freq': 1877, 'weight': 0.03081798180803205},
                          '火车票': {'freq': 1810, 'weight': 0.02971792598430368},
                          '单': {'freq': 1806, 'weight': 0.029652251009752734},
                          '2016': {'freq': 1632, 'weight': 0.026795389616786524},
                          '韵达': {'freq': 1593, 'weight': 0.026155058614914788},
                          'lol': {'freq': 1543, 'weight': 0.025334121433027944},
                          '余额': {'freq': 1506, 'weight': 0.024726627918431682},
                          '天气预报': {'freq': 1506, 'weight': 0.024726627918431682},
                          '系统': {'freq': 1455, 'weight': 0.023889271992907104},
                          '公积金': {'freq': 1429, 'weight': 0.023462384658325945},
                          '天': {'freq': 1132, 'weight': 0.018586017797918103},
                          '圆通': {'freq': 1129, 'weight': 0.018536761567004893},
                          '物流': {'freq': 1071, 'weight': 0.017584474436016156},
                          '时刻表': {'freq': 1065, 'weight': 0.017485961974189734},
                          '官网': {'freq': 1055, 'weight': 0.017321774537812366},
                          '开奖': {'freq': 1038, 'weight': 0.01704265589597084}},
                 '做法': {'大全': {'freq': 12875, 'weight': 0.21519664376807235},
                          '家常': {'freq': 5286, 'weight': 0.08835180263751692},
                          '汤': {'freq': 1862, 'weight': 0.031122031122031123},
                          '炒': {'freq': 1747, 'weight': 0.029199886342743484},
                          '鸡蛋': {'freq': 1093, 'weight': 0.01826873255444684},
                          '月饼': {'freq': 1092, 'weight': 0.018252018252018252},
                          '红烧': {'freq': 1043, 'weight': 0.017433017433017433},
                          '茄子': {'freq': 1016, 'weight': 0.01698173126744555},
                          '粥': {'freq': 1012, 'weight': 0.0169148740577312},
                          '豆腐': {'freq': 1002, 'weight': 0.016747731033445318},
                          '土豆': {'freq': 932, 'weight': 0.01557772986344415},
                          '正宗': {'freq': 897, 'weight': 0.014992729278443565},
                          '视频': {'freq': 866, 'weight': 0.014474585903157332},
                          '排骨': {'freq': 818, 'weight': 0.013672299386585101},
                          '牛肉': {'freq': 812, 'weight': 0.013572013572013573},
                          '凉拌': {'freq': 739, 'weight': 0.012351869494726638},
                          '炖': {'freq': 738, 'weight': 0.01233515519229805},
                          '鱼': {'freq': 721, 'weight': 0.01205101205101205},
                          '蛋糕': {'freq': 714, 'weight': 0.011934011934011933},
                          '家常菜': {'freq': 675, 'weight': 0.011282154139296997}},
                 '倾城': {'微微一笑': {'freq': 50628, 'weight': 0.8353904032737113},
                          '电视剧': {'freq': 9265, 'weight': 0.1528776978417266},
                          '小说': {'freq': 4284, 'weight': 0.07068840340571579},
                          '一笑倾城': {'freq': 2089, 'weight': 0.03446967196884694},
                          '妖': {'freq': 1997, 'weight': 0.032951620355092075},
                          '微微': {'freq': 1979, 'weight': 0.03265461025674873},
                          '电影': {'freq': 1910, 'weight': 0.03151607154643258},
                          '半': {'freq': 1900, 'weight': 0.03135106593624183},
                          '集': {'freq': 1749, 'weight': 0.02885948122236156},
                          '一笑': {'freq': 1566, 'weight': 0.025839878555870898},
                          '美人': {'freq': 1428, 'weight': 0.0235628011352386},
                          '更新': {'freq': 1347, 'weight': 0.022226255692693552},
                          '演员表': {'freq': 1189, 'weight': 0.019619167051679758},
                          '百度云': {'freq': 1141, 'weight': 0.018827140122764173},
                          'txt': {'freq': 975, 'weight': 0.016088046993597782},
                          '插曲': {'freq': 930, 'weight': 0.015345521747739423},
                          'ko': {'freq': 856, 'weight': 0.0141244802323279},
                          '游戏': {'freq': 799, 'weight': 0.013183948254240643},
                          '时间': {'freq': 773, 'weight': 0.012754933667744703},
                          '下载': {'freq': 762, 'weight': 0.012573427496534883}}}

## 确定竞争性关键词集合

In [None]:
# Turn each seed keyword's {mid_word: {'freq', 'weight'}} mapping into a list
# of flat records {'keyword', 'freq', 'weight'}, keeping the original order.
temp = mid_word_dict
mid_word_dict = {
    seed: [
        {'keyword': mid_word, 'freq': stats['freq'], 'weight': stats['weight']}
        for mid_word, stats in mid_words.items()
    ]
    for seed, mid_words in temp.items()
}
mid_word_dict

In [None]:

# For every active seed keyword:
#   1. keep only that seed keyword's entry of mid_word_dict,
#   2. drop mediating keywords that contain the seed keyword as a substring,
#   3. keep the first 10 remaining mediating keywords.
mid_word_dict = {
    seed: [entry for entry in mid_word_dict[seed] if seed not in entry['keyword']][:10]
    for seed in seedwords
}

mid_word_dict

In [None]:
# Select the queries that do NOT contain the seed word but DO contain one of
# its mediator keywords, and store them in a per-seed file.
def comkey_words_file(seedword: str):
    """Write the competitor-candidate queries of ``seedword`` to disk.

    Reads '../data/temp/cleaned.train' line by line and copies to
    '../data/temp/compkey_<seedword>' every line that does not contain
    ``seedword`` but contains at least one mediator keyword listed in the
    global ``mid_word_dict[seedword]``. Matching is plain substring
    containment on the raw line; lines are written unchanged.

    Args:
        seedword: Seed keyword; must be a key of ``mid_word_dict``.
    """
    filename = '../data/temp/compkey_' + seedword
    # Extract the keyword strings once instead of once per input line.
    mid_keywords = [midkeyword['keyword'] for midkeyword in mid_word_dict[seedword]]
    # Context managers guarantee both files are closed even if an error occurs.
    with open('../data/temp/cleaned.train', 'r', encoding='utf-8') as query_data, \
            open(filename, 'w', encoding='utf-8') as result_data:
        for line in query_data:
            if seedword in line:
                continue
            # Write each matching line once, no matter how many keywords hit.
            if any(keyword in line for keyword in mid_keywords):
                result_data.write(line)


# Build the candidate file of every seed word, echoing the seed as a progress marker.
for seed in seedwords:
    print(seed)
    comkey_words_file(seed)

In [None]:
from collections import Counter


def stop_words_filter(word_list):
    """Return the words of ``word_list`` that are neither stop words nor blank.

    A word is dropped when it is a member of the global ``stopwords`` or when
    it is empty after stripping whitespace. Order is preserved.
    """
    return [
        token
        for token in word_list
        if token not in stopwords and token.strip() != ''
    ]


import jieba

# Register two extra user dictionaries with jieba before segmenting.
for _userdict_path in (
    '../data/dict/流行网络小说词库.txt',
    '../data/dict/网络流行新词【官方推荐】.txt',
):
    jieba.load_userdict(_userdict_path)

# Accumulators filled by get_compkey_words: one inner list per processed seed word.
ka_list, comp_list = [], []


# Extract, for each mediator keyword of a seed word, the most frequent
# co-occurring word from that seed word's candidate-query file.
def get_compkey_words(seedword: str):
    """Collect the top competitor keyword per mediator keyword of ``seedword``.

    For every entry of the global ``mid_word_dict[seedword]`` the file
    '../data/temp/compkey_<seedword>' is scanned; lines containing the
    mediator keyword (substring match) are segmented with jieba, stop words
    are removed and the tokens are counted. The three most common tokens that
    differ from the mediator keyword are printed, and only the first of them
    is recorded together with its count.

    Side effects:
        Appends one list of counts to the global ``ka_list`` and one list of
        keywords to the global ``comp_list``.

    NOTE(review): when no token other than the mediator keyword appears among
    the top three, nothing is recorded for that mediator keyword, so the
    appended lists can be shorter than ``mid_word_dict[seedword]``. Code that
    pairs them by position should confirm the lengths match.

    Args:
        seedword: Seed keyword; must be a key of ``mid_word_dict``.
    """
    ka_list_line = []
    comp_list_line = []
    compkey_path = '../data/temp/compkey_' + seedword
    for mid_word in mid_word_dict[seedword]:
        mid_keyword = mid_word['keyword']
        # Segment every candidate query that contains this mediator keyword.
        compkey_query_list = []
        # The context manager closes the file even if segmentation raises.
        with open(compkey_path, 'r', encoding='utf-8') as compkey_data:
            for line in compkey_data:
                if mid_keyword in line:
                    compkey_query_list.extend(jieba.lcut(line.strip()))
        compkey_query_list = stop_words_filter(compkey_query_list)
        count_result = Counter(compkey_query_list)
        # Print the most frequent words (the mediator keyword itself is skipped).
        is_append = False
        for key, val in count_result.most_common(3):
            if key != mid_keyword:
                # Record only the first (most frequent) candidate.
                if not is_append:
                    ka_list_line.append(val)
                    comp_list_line.append(key)
                    is_append = True
                print(key, val, end='  ')
        print()
    ka_list.append(ka_list_line)
    comp_list.append(comp_list_line)


# Run the extraction for every seed word under a small printed header.
for seed in seedwords:
    header = '\n' + seed + ':\n'
    print(header)
    get_compkey_words(seed)

In [None]:
# Inspect the counts collected by get_compkey_words (one inner list per seed word).
ka_list

In [None]:
# Inspect the competitor keywords collected by get_compkey_words (one inner list per seed word).
comp_list

In [None]:
# Cached ka_list and comp_list.
# NOTE(review): presumably captured from an earlier run of get_compkey_words so
# the slow extraction can be skipped — confirm the rows still line up with the
# current order of seedwords and of each seed word's mediator keywords.
ka_list = [[12907, 3002, 1183, 1495, 8434, 3059, 32665, 23397, 10427, 3193],
           [34782, 10393, 2825, 3805, 5226, 1021, 2824, 5205, 1194, 1889],
           [6995, 2128, 756, 3776, 2053, 22595, 25052, 3795, 33393, 46657],
           [832, 17397, 17301, 31526, 22987, 1020, 3586, 529, 34854, 32624],
           [1205, 5397, 18670, 2291, 5809, 14470, 4107, 6890, 10572, 2125],
           [5512, 4265, 1958, 5128, 5657, 4677, 1268, 10388, 8809, 158449],
           [4737, 22212, 2053, 16373, 2050, 159125, 898, 3271, 2213, 34848],
           [874, 1011, 18237, 7888, 18837, 31518, 22944, 5221, 47856, 48585],
           [28837, 456, 32451, 1421, 2220, 2688, 350, 3226, 5982, 5201],
           [533, 2608, 5692, 86, 1683, 4619, 34410, 1000, 5737, 515]]

comp_list = [['做法', '适合', '2016', '句子', '价格表', '视频', '年', '手机版', '头像', '头像'],
             ['txt', '头像', '6s', '下载', '荣耀', '高清', '路由器', '教学', '官网', 'note7'],
             ['穿越火线', '电影', '文', '2016', '耽', '全文', '重生之', '女主角', 'txt', '微微一笑'],
             ['设计', '观看', '在线', '图片', '手机版', '游戏', '图片', '教学', 'txt', '年'],
             ['重生之', '电视剧', '手机版', '世界', '手机', '阅读', '手机游戏', '穿越', '头像', '官方网站'],
             ['发型', '家常', '好听', '教学', '韩国', '下载', '下载', '头像', '倾城', '小说'],
             ['女生', '手机版', '个人空间', '查询', '侠盗', '小说', '163', '背景音乐', '女生', 'txt'],
             ['中国', '视频', '在线', '下载', '观看', '图片', '手机版', '小说', '倾城', '微微一笑'],
             ['最新', '新', '年', '没', '百度', '大学排名', '人物', '学院', '约', '教学'],
             ['魔兽争霸', '月', '异界', '学', '幸福生活', '下载', 'txt', '告白', '奇迹', '游戏']]

In [None]:
# Pair every seed word with its list of competitor keywords (matched by position).
compkeywords = {seed: competitors for seed, competitors in zip(seedwords, comp_list)}
compkeywords

In [None]:
# Two-dimensional list: for each seed word (in seedwords order), the keyword
# strings of its mediator keywords.
midkeywords_list = [
    [entry['keyword'] for entry in mid_word_dict[seed]]
    for seed in seedwords
]
midkeywords_list

In [None]:
# Competitor keyword found through each mediator keyword.
# Layout: {seed word: {mediator keyword: competitor keyword}}
# Built in one pass; the former dict.fromkeys(seedwords, {}) initialisation made
# every seed word share a single dict object until it was overwritten.
compwords = {
    seedwords[i]: dict(zip(each_list, comp_list[i]))
    for i, each_list in enumerate(midkeywords_list)
}
compwords

In [None]:
# Store the |{ka}| counts in ka_query_volume.
# Layout: {seed word: {mediator keyword: |{ka}| of the competitor found via it}}
# Built in one pass; the former dict.fromkeys(seedwords, {}) initialisation made
# every seed word share a single dict object until it was overwritten.
ka_query_volume = {
    seedwords[i]: dict(zip(each_list, ka_list[i]))
    for i, each_list in enumerate(midkeywords_list)
}
ka_query_volume

## 计算竞争性关键字的竞争度

$$Comp_s(k,s)=\frac{|\{ka\}|}{(|\{a\}|-|\{sa\}|)}$$

计算 |{a}|

In [None]:
# {seed word: {mediator keyword: |{a}|}}; each seed word gets its own empty dict
# (dict.fromkeys(seedwords, {}) would make all of them share one object).
a_query_volume = {seedword: {} for seedword in seedwords}


# Compute the search volume |{a}| of every mediator keyword of a seed word.
def count_midkeyword(seedword):
    """Count, per mediator keyword of ``seedword``, the queries containing it.

    Scans '../data/temp/cleaned.train' and, for each line, increments the
    counter of every mediator keyword that occurs in the line as a substring.
    The mediator keywords are taken from the global ``midkeywords_list`` at
    the position of ``seedword`` within the global ``seedwords``.

    Args:
        seedword: Seed keyword; must be an element of ``seedwords``.

    Returns:
        dict mapping each mediator keyword to its line count.

    Raises:
        ValueError: If ``seedword`` is not in ``seedwords``.
    """
    # Resolve the keyword list once instead of once per input line.
    midkeywords = midkeywords_list[seedwords.index(seedword)]
    count_dict = dict.fromkeys(midkeywords, 0)
    # The context manager closes the file even if an error occurs mid-scan.
    with open('../data/temp/cleaned.train', 'r', encoding='utf-8') as query_data:
        for sentence in query_data:
            for midkeyword in midkeywords:
                if midkeyword in sentence:
                    count_dict[midkeyword] += 1
    return count_dict


# Fill a_query_volume one seed word at a time, printing the seed as a progress marker.
for seed in seedwords:
    print(seed)
    a_query_volume[seed] = count_midkeyword(seed)

a_query_volume

In [None]:


# {seed word: {mediator keyword: |{sa}|}}; each seed word gets its own empty dict
# (dict.fromkeys(seedwords, {}) would make all of them share one object).
sa_query_volume = {seedword: {} for seedword in seedwords}


# Map each mediator keyword in mid_word_dict to its freq and store the result
# in sa_query_volume.
def get_sa_query_volume():
    """Fill the global ``sa_query_volume`` from ``mid_word_dict`` and return it.

    For every seed word the entry is replaced by a fresh
    {mediator keyword: freq} dict built from ``mid_word_dict[seed]``.
    """
    for seed in seedwords:
        sa_query_volume[seed] = {
            entry['keyword']: entry['freq'] for entry in mid_word_dict[seed]
        }
    return sa_query_volume


# Populate sa_query_volume in place; the returned reference is not used here.
get_sa_query_volume()

# Show the resulting {seed word: {mediator keyword: freq}} mapping.
sa_query_volume

In [None]:

# 计算Comp
import copy


def getcomp():
    """Return |{ka}| / (|{a}| - |{sa}|) per seed word and mediator keyword.

    Starts from a deep copy of the global ``sa_query_volume`` and overwrites
    the value of every (seed word, mediator keyword) pair present in the
    global ``a_query_volume`` with the quotient computed from
    ``ka_query_volume``, ``a_query_volume`` and ``sa_query_volume``.
    """
    comp_query_volume = copy.deepcopy(sa_query_volume)
    for seedword, a_counts in a_query_volume.items():
        for midkeyword, a_count in a_counts.items():
            ka_count = ka_query_volume[seedword][midkeyword]
            remaining = a_count - sa_query_volume[seedword][midkeyword]
            comp_query_volume[seedword][midkeyword] = ka_count / remaining
    return comp_query_volume


# Per-mediator-keyword competitiveness |{ka}| / (|{a}| - |{sa}|) for every seed word.
result_query_volume = getcomp()
result_query_volume


In [None]:
# Weight of every mediator keyword: {seed word: {mediator keyword: weight}}.
# Built directly instead of starting from dict.fromkeys(seedwords, {}), which
# would bind every seed word to one shared dict object.
w_midkeyword = {
    seedword: {
        midkeyword['keyword']: midkeyword['weight']
        for midkeyword in mid_word_dict[seedword]
    }
    for seedword in seedwords
}
w_midkeyword

In [None]:
def comp_result():
    """Compute Comp(k, s) for every competitor keyword k of every seed word s.

    For each seed word in the global ``result_query_volume`` the i-th mediator
    keyword is paired with the i-th entry of ``compkeywords[seedword]``, and
    the product ``w_midkeyword[seedword][mid] * result_query_volume[seedword][mid]``
    is added to that competitor's score. A competitor reached through several
    mediator keywords therefore receives the SUM of its contributions,
    matching Comp(k, s) = sum_i w_{a_i}(k) * Comp_{a_i}(k, s); previously a
    later contribution silently overwrote an earlier one.

    The positional pairing relies on the dict iteration order of
    ``result_query_volume[seedword]`` matching the order of
    ``compkeywords[seedword]``.

    Returns:
        dict of the form {seed word: {competitor keyword: score}}.

    Raises:
        IndexError: If a seed word has more mediator keywords than
            competitor keywords.
    """
    result = {}
    for seedword, comp_by_mid in result_query_volume.items():
        per_seed = result.setdefault(seedword, {})
        for i, (midkeyword, comp_value) in enumerate(comp_by_mid.items()):
            competitor = compkeywords[seedword][i]
            weighted = w_midkeyword[seedword][midkeyword] * comp_value
            # Accumulate so repeated competitors are summed, not overwritten.
            per_seed[competitor] = per_seed.get(competitor, 0) + weighted
    return result


# Competitiveness of each keyword k with respect to seed keyword s.
comp_k_s = comp_result()

# Re-order every seed word's competitor keywords from highest to lowest score.
for seedword in seedwords:
    ranked_pairs = sorted(comp_k_s[seedword].items(), key=lambda pair: pair[1], reverse=True)
    comp_k_s[seedword] = dict(ranked_pairs)
comp_k_s

In [None]:
# Final table: for every seed word, its competitor keywords and their scores,
# ordered from highest to lowest score.
comp_result_dict = {}
for seedword in seedwords:
    # Look up the score of each competitor keyword listed for this seed word.
    scores = {compword: comp_k_s[seedword][compword] for compword in compkeywords[seedword]}
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    # Drop single-character keywords and the literal 'txt'.
    comp_result_dict[seedword] = {
        word: score
        for word, score in ranked
        if len(word) > 1 and word != 'txt'
    }

# Truncating each seed word to its top five competitors is intentionally not
# applied here.

comp_result_dict