## 数据预处理



In [None]:
import os.path

# Extract the query fields from the raw training file (decoded as gb18030)
# and write them to OUTPUT_DATA_PATH as UTF-8, one field per line.
OUTPUT_DATA_PATH = '../data/processed/query_words.train'

# Mode 'w' truncates an existing output file, so no explicit delete is
# needed. The `with` statement closes both handles even when reading or
# writing raises, and never touches a handle that failed to open.
with open('../data/raw/sogou_rematch/user_tag_query.10W.TRAIN', 'r', encoding='gb18030') as train_data, \
     open(OUTPUT_DATA_PATH, 'w', encoding='utf-8') as output_data:
    for line in train_data:
        # Fields are tab-separated; only the fields from index 4 onward
        # (the query terms) are kept.
        line_list = line.split('\t')[4:]
        # The last field still carries the line's own trailing newline (when
        # the line has one), so joining with '\n' puts every field on its own
        # line without an extra terminator being added here.
        output_data.write('\n'.join(line_list))

In [None]:
import re
import os.path

# Copy query_words.train to OUTPUT_DATA_PATH, dropping every line whose first
# tab-separated field contains a URL.
OUTPUT_DATA_PATH = '../data/processed/cleaned.train'

# Compiled once, outside the loop: matches http(s)/ftp/file URLs.
pattern = re.compile(r'(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]')

# Mode 'w' truncates an existing output file, so no explicit delete is
# needed. The `with` statement closes both handles even when an error occurs,
# and never touches a handle that failed to open.
with open('../data/processed/query_words.train', 'r', encoding='utf-8') as train_data, \
     open(OUTPUT_DATA_PATH, 'w', encoding='utf-8') as output_data:
    for line in train_data:
        word_list = line.split('\t')
        # Skip the whole line when its first field contains a URL
        if pattern.search(word_list[0]):
            continue
        # Any remaining tab-separated fields are written one per line; the
        # last field keeps the line's own trailing newline.
        output_data.write('\n'.join(word_list))

## 分词

In [None]:
import jieba
# Segment every line of cleaned.train with jieba and write the tokens to
# seg_list.train, one token per line. The `with` statement guarantees both
# files are flushed and closed even if segmentation or writing raises.
with open('../data/processed/cleaned.train', 'r', encoding='utf-8') as train_data, \
     open('../data/processed/seg_list.train', 'w', encoding='utf-8') as output_data:
    for line in train_data:
        line = line.strip()
        # Drop tokens that are a single space character
        temp_seg_list = [word for word in jieba.cut(line) if word != ' ']
        # One token per line, followed by a newline that ends the record
        # (an input line that yields no tokens produces an empty line).
        output_data.write('\n'.join(temp_seg_list) + '\n')

print('分词处理完成！')

## 合并停用词文件

In [None]:
# Source stop-word lists and the destination for their union
stopwords_file_1 = '../data/stop_words/baidu_stopwords.txt'
stopwords_file_2 = '../data/stop_words/cn_stopwords.txt'
output_file = '../data/stop_words/merge_stopwords.txt'

# A set collapses entries that occur in both lists
stopwords = set()

# Read both source files in turn, stripping surrounding whitespace from
# each line before adding it to the set
for source_path in (stopwords_file_1, stopwords_file_2):
    with open(source_path, 'r', encoding='utf-8') as source:
        stopwords.update(entry.strip() for entry in source)

# Write the merged entries in sorted order, one per line
with open(output_file, 'w', encoding='utf-8') as merged:
    merged.writelines(entry + '\n' for entry in sorted(stopwords))

## 过滤词

In [None]:
# Step 1: load the merged stop-word list into a set
stopwords_file = '../data/stop_words/merge_stopwords.txt'
stopwords = set()

with open(stopwords_file, 'r', encoding='utf-8') as stop_source:
    stopwords.update(entry.strip() for entry in stop_source)

# Step 2: stream the segmented training data and drop stop words
train_file = '../data/processed/seg_list.train'
output_file = '../data/processed/filter_list.train'

with open(train_file, 'r', encoding='utf-8') as train_data, \
     open(output_file, 'w', encoding='utf-8') as output_data:

    for raw_line in train_data:
        # Split the stripped line on whitespace and keep the non-stop-words
        kept = [token for token in raw_line.strip().split() if token not in stopwords]
        if not kept:
            # Nothing survived the filter: do not emit an empty line
            continue
        # Surviving tokens are written space-separated on a single line
        output_data.write(' '.join(kept) + '\n')


## 选取关键词

In [None]:
# 导入必要的库
from collections import Counter

def read_and_count_words(filename, exclude_single=True, exclude_specific=None, *, top_n=20):
    """Return the most frequent words in a one-word-per-line UTF-8 file.

    Each line is stripped of surrounding whitespace and treated as one word.
    Blank lines are never counted.

    Args:
        filename: Path of the text file to read.
        exclude_single: When true, words of length 1 are ignored.
        exclude_specific: Optional iterable of words to ignore entirely.
        top_n: Number of entries to return (default 20). ``None`` returns
            every counted word.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # A frozenset gives constant-time membership tests for the exclusions
    excluded = frozenset(exclude_specific) if exclude_specific else frozenset()
    # A minimum length of 1 still rejects the empty string left by blank lines
    min_length = 2 if exclude_single else 1

    word_counts = Counter()
    # Stream the file so the whole word list is never held in memory
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            word = line.strip()
            if len(word) >= min_length and word not in excluded:
                word_counts[word] += 1

    return word_counts.most_common(top_n)

# Rank the words of the filtered corpus, ignoring single-character words
# and the token '2016'
filename = '../data/processed/filter_list.train'
exclude_specific = ['2016']
most_common_20 = read_and_count_words(
    filename,
    exclude_single=True,
    exclude_specific=exclude_specific,
)

# Report each returned word with its frequency, one "word: count" per line
for word, freq in most_common_20:
    print(f'{word}: {freq}')