In [8]:
import re
import os
from jieba import cut
from itertools import chain
from collections import Counter
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer


In [10]:
def get_words(filename):
    """Tokenize a UTF-8 text file and return the tokens longer than one character.

    Each line is stripped of surrounding whitespace, has digits and the listed
    punctuation characters removed, and is then segmented with jieba's ``cut``.

    Args:
        filename: path of the text file to read.

    Returns:
        A flat list of tokens (in file order) whose length is greater than 1.
    """
    # Characters treated as noise: dots, brackets, digits and common punctuation.
    noise = re.compile(r'[.【】0-9、——。，！~\*]')
    tokens = []
    with open(filename, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = noise.sub('', raw_line.strip())
            # Single-character tokens are discarded.
            tokens += [token for token in cut(cleaned) if len(token) > 1]
    return tokens

In [11]:
def get_text_content(filename):
    """Return the whole content of a UTF-8 text file with noise characters removed.

    The result is a single string (line breaks are kept) intended to be fed to
    the TF-IDF vectorizer.

    Args:
        filename: path of the text file to read.

    Returns:
        The file content with digits and the listed punctuation stripped out.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    # Same noise character set as used for the word-count features.
    return re.sub(r'[.【】0-9、——。，！~\*]', '', raw_text)

In [12]:
# Module-level cache: one token list per training mail, in file order.
# It is read by the feature-extraction code after get_top_words() has run.
all_words = []
def get_top_words(top_num):
    """Rebuild the token cache from the training mails and return the most frequent words.

    Reads ``邮件_files/0.txt`` .. ``邮件_files/150.txt``, stores one token list
    per mail in the module-level ``all_words`` and counts every token.

    Args:
        top_num: how many of the most frequent words to return.

    Returns:
        A list with the ``top_num`` most common words, most frequent first.
    """
    filename_list = ['邮件_files/{}.txt'.format(i) for i in range(151)]
    # Replace the cache content instead of appending to it: appending made a
    # second call double-count every word and left duplicate rows behind.
    # The new list is fully built first, so a read error keeps the old cache.
    all_words[:] = [get_words(filename) for filename in filename_list]
    # Flatten the per-mail token lists and count occurrences of each word.
    freq = Counter(chain.from_iterable(all_words))
    return [word for word, _count in freq.most_common(top_num)]

In [13]:
def extract_features(feature_type='high_freq', top_num=100):
    """Build the training feature matrix with the selected feature scheme.

    Args:
        feature_type: ``'high_freq'`` for raw counts of the most frequent
            words, ``'tfidf'`` for TF-IDF weighted features.
        top_num: number of feature words (columns) to keep.

    Returns:
        A ``(feature_matrix, vectorizer)`` tuple.
        For ``'high_freq'``: a NumPy array of word counts with one row per
        training mail, and the list of top words that define the columns.
        For ``'tfidf'``: the matrix returned by
        ``TfidfVectorizer.fit_transform`` and the fitted ``TfidfVectorizer``.

    Raises:
        ValueError: if ``feature_type`` is neither ``'high_freq'`` nor ``'tfidf'``.
    """
    if feature_type == 'high_freq':
        # Start from an empty cache so a repeated call cannot leave duplicate
        # rows behind (get_top_words fills all_words as a side effect).
        del all_words[:]
        top_words = get_top_words(top_num)
        vector = []
        for words in all_words:
            # Count each mail once instead of calling list.count per feature word.
            counts = Counter(words)
            vector.append([counts[word] for word in top_words])
        return np.array(vector), top_words

    elif feature_type == 'tfidf':
        filename_list = ['邮件_files/{}.txt'.format(i) for i in range(151)]
        corpus = [get_text_content(filename) for filename in filename_list]

        def _tokenize(text):
            # Same rule as the word-count features: drop single-character tokens.
            return [w for w in cut(text) if len(w) > 1]

        tfidf_vectorizer = TfidfVectorizer(
            tokenizer=_tokenize,
            # A custom tokenizer makes token_pattern unused; passing None
            # avoids the UserWarning scikit-learn emits about that on fit.
            token_pattern=None,
            max_features=top_num
        )
        tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
        return tfidf_matrix, tfidf_vectorizer

    else:
        raise ValueError("Invalid feature_type. Must be 'high_freq' or 'tfidf'")

In [14]:
def train_model(feature_type='high_freq', top_num=100):
    """Fit a MultinomialNB classifier on the training mails.

    Args:
        feature_type: ``'high_freq'`` for word-count features, ``'tfidf'``
            for TF-IDF weighted features.
        top_num: number of feature words to use.

    Returns:
        A ``(model, vectorizer)`` tuple: the fitted ``MultinomialNB`` and the
        second value returned by ``extract_features`` (the fitted
        ``TfidfVectorizer`` for ``'tfidf'``, the top-word list for ``'high_freq'``).
    """
    # Reset the shared token cache so repeated training runs do not accumulate.
    global all_words
    all_words = []

    features, fitted_vectorizer = extract_features(feature_type, top_num)

    # The first 127 mails (0.txt-126.txt) are labelled spam (1); the remaining
    # 24 mails (127.txt-150.txt) are labelled normal (0).
    spam_labels = [1] * 127
    normal_labels = [0] * 24
    targets = np.array(spam_labels + normal_labels)

    classifier = MultinomialNB()
    classifier.fit(features, targets)

    return classifier, fitted_vectorizer


In [15]:
def predict(filename, model, vectorizer, feature_type='high_freq'):
    """Classify a single mail file as spam or normal mail.

    Args:
        filename: path of the mail to classify.
        model: the fitted classifier; only its ``predict`` method is used.
        vectorizer: for ``'tfidf'`` the fitted vectorizer (its ``transform``
            is called); for ``'high_freq'`` the list of top words that
            defines the feature columns.
        feature_type: ``'high_freq'`` or ``'tfidf'``; must match the scheme
            the model was trained with.

    Returns:
        ``'垃圾邮件'`` when the predicted label is 1, otherwise ``'普通邮件'``.

    Raises:
        ValueError: if ``feature_type`` is neither ``'high_freq'`` nor ``'tfidf'``.
    """
    if feature_type == 'high_freq':
        top_words = vectorizer
        # Count the mail's tokens once instead of list.count per feature word.
        counts = Counter(get_words(filename))
        current_vector = np.array([counts[word] for word in top_words])
        # The model expects a 2-D input: one row, one column per top word.
        result = model.predict(current_vector.reshape(1, -1))

    elif feature_type == 'tfidf':
        text_content = get_text_content(filename)
        current_vector = vectorizer.transform([text_content])
        result = model.predict(current_vector)

    else:
        raise ValueError("Invalid feature_type. Must be 'high_freq' or 'tfidf'")

    # Exactly one sample was predicted; read its label explicitly rather than
    # relying on the truthiness of a one-element array comparison.
    return '垃圾邮件' if result[0] == 1 else '普通邮件'

In [None]:
# Train with high-frequency word-count features, then classify the mails
# 151.txt-155.txt, which lie outside the training range (0.txt-150.txt).
print("使用高频词特征进行训练和预测:")
high_freq_model, high_freq_vectorizer = train_model(feature_type='high_freq', top_num=100)
# One loop instead of five copy-pasted predict/print lines; output is unchanged.
for mail_id in range(151, 156):
    mail_path = '邮件_files/{}.txt'.format(mail_id)
    mail_label = predict(mail_path, high_freq_model, high_freq_vectorizer, 'high_freq')
    print('{}.txt分类情况:{}'.format(mail_id, mail_label))

使用高频词特征进行训练和预测:
151.txt分类情况:垃圾邮件
152.txt分类情况:垃圾邮件
153.txt分类情况:垃圾邮件
154.txt分类情况:垃圾邮件
155.txt分类情况:普通邮件


In [17]:
# Train with TF-IDF weighted features, then classify the mails
# 151.txt-155.txt, which lie outside the training range (0.txt-150.txt).
print("使用TF-IDF加权特征进行训练和预测:")
tfidf_model, tfidf_vectorizer = train_model(feature_type='tfidf', top_num=100)
# One loop instead of five copy-pasted predict/print lines; output is unchanged.
for mail_id in range(151, 156):
    mail_path = '邮件_files/{}.txt'.format(mail_id)
    mail_label = predict(mail_path, tfidf_model, tfidf_vectorizer, 'tfidf')
    print('{}.txt分类情况:{}'.format(mail_id, mail_label))

使用TF-IDF加权特征进行训练和预测:




151.txt分类情况:垃圾邮件
152.txt分类情况:垃圾邮件
153.txt分类情况:垃圾邮件
154.txt分类情况:垃圾邮件
155.txt分类情况:垃圾邮件
