In [16]:
import re
import os
from jieba import cut
from itertools import chain
from collections import Counter
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

In [17]:
# Paths of the e-mails to classify; the main cell further down assigns this
# same list again right before running the predictions.
test_files = [
    '邮件_files/151.txt',
    '邮件_files/152.txt',
]

In [18]:
def get_words(filename):
    """Read a UTF-8 text file and return its segmented words.

    Each line is stripped, cleaned of the noise characters listed in the
    pattern below (brackets, digits, some punctuation), segmented with
    jieba's ``cut`` and filtered so that only tokens longer than one
    character are kept.

    Args:
        filename: Path of the text file to read.

    Returns:
        A flat list of tokens in the order they appear in the file.
    """
    noise = re.compile(r'[【】0-9、——。，！~\*]')
    tokens = []
    with open(filename, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = noise.sub('', raw_line.strip())
            # Single-character tokens are dropped.
            tokens += [token for token in cut(cleaned) if len(token) > 1]
    return tokens

In [19]:
def get_top_words(top_num, file_list):
    """Return the most frequent words across a set of e-mail files.

    Args:
        top_num: How many words to return (passed to ``Counter.most_common``).
        file_list: Iterable of file paths; each one is tokenized with
            ``get_words``.

    Returns:
        A list of words ordered from most to least frequent.
    """
    frequency = Counter()
    for path in file_list:
        # Accumulate counts file by file instead of building one big list.
        frequency.update(get_words(path))
    return [word for word, _count in frequency.most_common(top_num)]

In [20]:
def _tokenize_chinese(text):
    """Segment text with jieba's ``cut`` and drop tokens of length <= 1."""
    return [token for token in cut(text) if len(token.strip()) > 1]


def get_tfidf_features(file_list, top_num=None):
    """Extract TF-IDF features from a list of e-mail files.

    The text is segmented with jieba before weighting. Chinese text has no
    spaces between words, so the vectorizer's default regex tokenizer would
    treat whole runs of characters as a single "word"; a jieba-based
    tokenizer is therefore installed on the vectorizer itself. Because the
    tokenizer lives on the returned vectorizer, later calls to
    ``vectorizer.transform`` on raw text use the same segmentation.

    Args:
        file_list: Iterable of paths to UTF-8 text files.
        top_num: Maximum vocabulary size (``max_features``); ``None`` keeps
            every term.

    Returns:
        A ``(tfidf_matrix, vectorizer)`` tuple: the fitted document-term
        matrix and the fitted ``TfidfVectorizer``.
    """
    texts = []
    for filename in file_list:
        with open(filename, 'r', encoding='utf-8') as fr:
            # Remove the same noise characters that get_words() filters out.
            texts.append(re.sub(r'[【】0-9、——。，！~\*]', '', fr.read()))
    vectorizer = TfidfVectorizer(
        max_features=top_num,
        tokenizer=_tokenize_chinese,
        # The regex pattern is unused once a tokenizer is supplied; passing
        # None avoids the "token_pattern will not be used" warning.
        token_pattern=None,
    )
    tfidf_matrix = vectorizer.fit_transform(texts)
    return tfidf_matrix, vectorizer


In [21]:
def preprocess(file_list, feature_type='high_freq', top_num=100):
    """Turn e-mail files into a feature matrix.

    Args:
        file_list: Sequence of file paths to featurize.
        feature_type: ``'high_freq'`` for raw counts of the most frequent
            words, or ``'tfidf'`` for TF-IDF weights.
        top_num: Number of most frequent words (``'high_freq'``) or maximum
            vocabulary size (``'tfidf'``).

    Returns:
        ``(vectors, top_words)`` for ``'high_freq'``, where ``vectors`` is a
        NumPy array with one row per file and one column per top word; or
        ``(vectors, vectorizer)`` for ``'tfidf'`` as returned by
        ``get_tfidf_features``.

    Raises:
        ValueError: If ``feature_type`` is neither ``'high_freq'`` nor
            ``'tfidf'``.
    """
    if feature_type == 'high_freq':
        # Tokenize each file exactly once and keep its word counts, so the
        # corpus-wide frequencies and the per-file vectors share the work.
        file_counts = [Counter(get_words(filename)) for filename in file_list]

        freq = Counter()
        for counts in file_counts:
            freq.update(counts)
        top_words = [word for word, _ in freq.most_common(top_num)]

        # A Counter returns 0 for absent words, matching list.count().
        vectors = [[counts[word] for word in top_words] for counts in file_counts]
        return np.array(vectors), top_words
    elif feature_type == 'tfidf':
        vectors, vectorizer = get_tfidf_features(file_list, top_num)
        return vectors, vectorizer
    else:
        raise ValueError("Unsupported feature type. Choose 'high_freq' or 'tfidf'.")

In [22]:
def balance_samples(X, y):
    """Balance the samples with SMOTE.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``.

    Returns:
        A ``(X_resampled, y_resampled)`` tuple produced by
        ``SMOTE(random_state=42).fit_resample``.
    """
    sampler = SMOTE(random_state=42)  # fixed seed keeps resampling repeatable
    features, targets = sampler.fit_resample(X, y)
    return features, targets

In [23]:
def train_and_predict(file_list, labels, feature_type='high_freq', top_num=100, balance=False):
    """Fit a multinomial naive Bayes model on the given e-mail files.

    Args:
        file_list: Paths of the training files.
        labels: Class labels aligned with ``file_list``.
        feature_type: ``'high_freq'`` or ``'tfidf'``; forwarded to
            ``preprocess``.
        top_num: Feature-count limit forwarded to ``preprocess``.
        balance: When true, the training set is resampled with
            ``balance_samples`` before fitting.

    Returns:
        ``(model, top_words)`` for ``'high_freq'`` or ``(model, vectorizer)``
        for ``'tfidf'`` - the second item is whatever ``preprocess`` returned
        alongside the feature matrix.

    Raises:
        ValueError: If ``feature_type`` is not one of the supported values.
    """
    # Guard clause: reject unknown feature types before doing any work.
    if feature_type not in ('high_freq', 'tfidf'):
        raise ValueError("Unsupported feature type. Choose 'high_freq' or 'tfidf'.")

    features, extractor = preprocess(file_list, feature_type, top_num)
    targets = labels

    if balance:
        if feature_type == 'tfidf':
            # The TF-IDF matrix is converted to a dense array before resampling.
            features = features.toarray()
        features, targets = balance_samples(features, targets)

    model = MultinomialNB()
    model.fit(features, targets)
    return model, extractor

In [25]:
def predict_file(model, feature_type, filename, top_words=None, vectorizer=None):
    """Classify a single e-mail file with a trained model.

    Args:
        model: Fitted classifier exposing ``predict``.
        feature_type: ``'high_freq'`` or ``'tfidf'``; must match how
            ``model`` was trained.
        filename: Path of the e-mail to classify.
        top_words: Word list defining the count features; required for
            ``'high_freq'``.
        vectorizer: Fitted vectorizer exposing ``transform``; required for
            ``'tfidf'``.

    Returns:
        ``'垃圾邮件'`` when the predicted label equals 1, otherwise
        ``'普通邮件'``.

    Raises:
        ValueError: If ``feature_type`` is unsupported, or if the argument
            required by the chosen feature type was not supplied.
    """
    if feature_type == 'high_freq':
        # Fail early with a clear message instead of a TypeError from
        # iterating over None further down.
        if top_words is None:
            raise ValueError("top_words is required when feature_type is 'high_freq'.")
        # One pass over the tokens; absent words count as 0.
        counts = Counter(get_words(filename))
        word_map = [counts[word] for word in top_words]
        current_vector = np.array(word_map).reshape(1, -1)
    elif feature_type == 'tfidf':
        if vectorizer is None:
            raise ValueError("vectorizer is required when feature_type is 'tfidf'.")
        with open(filename, 'r', encoding='utf-8') as fr:
            # Remove the same noise characters that were stripped at training time.
            text = re.sub(r'[【】0-9、——。，！~\*]', '', fr.read())
        current_vector = vectorizer.transform([text])
    else:
        raise ValueError("Unsupported feature type. Choose 'high_freq' or 'tfidf'.")

    result = model.predict(current_vector)
    return '垃圾邮件' if result[0] == 1 else '普通邮件'

In [26]:
if __name__ == "__main__":
    # Training corpus: 邮件_files/1.txt through 邮件_files/151.txt.
    file_list = list(map('邮件_files/{}.txt'.format, range(1, 152)))
    # The first 127 files are labelled 1 and the remaining 24 are labelled 0;
    # predict_file() reports label 1 as '垃圾邮件' and anything else as '普通邮件'.
    labels = np.repeat([1, 0], [127, 24])

    # Model 1: high-frequency word counts, with SMOTE balancing.
    print("使用高频词特征进行训练和预测...")
    model_high_freq, top_words = train_and_predict(
        file_list,
        labels.copy(),
        feature_type='high_freq',
        top_num=100,
        balance=True,
    )

    # Model 2: TF-IDF features, with SMOTE balancing.
    print("使用TF-IDF特征进行训练和预测...")
    model_tfidf, vectorizer = train_and_predict(
        file_list,
        labels.copy(),
        feature_type='tfidf',
        top_num=100,
        balance=True,
    )

    # Classify the sample e-mails with both models.
    # NOTE(review): 151.txt is also part of file_list above, so its prediction
    # is made on a training sample - confirm whether the split is intended.
    test_files = ['邮件_files/151.txt', '邮件_files/152.txt']
    for file in test_files:
        print(f"{file} 分类情况（高频词特征）：{predict_file(model_high_freq, 'high_freq', file, top_words=top_words)}")
        print(f"{file} 分类情况（TF-IDF特征）：{predict_file(model_tfidf, 'tfidf', file, vectorizer=vectorizer)}")

使用高频词特征进行训练和预测...
使用TF-IDF特征进行训练和预测...
邮件_files/151.txt 分类情况（高频词特征）：普通邮件
邮件_files/151.txt 分类情况（TF-IDF特征）：普通邮件
邮件_files/152.txt 分类情况（高频词特征）：垃圾邮件
邮件_files/152.txt 分类情况（TF-IDF特征）：垃圾邮件
