In [None]:
#  朴素贝叶斯
from numpy import *
from os import listdir
import codecs  # 字符转换模块，用于文本的编码和解码
import jieba  # 中文分词库
import re
from sklearn.naive_bayes import MultinomialNB
from collections import Counter
from itertools import chain  # 用于串联迭代对象


In [None]:
#  构建文本处理函数
_STOP_WORDS_CACHE = {}  # stop-list path -> frozenset of the stop words it contains


def _load_stop_words(path="./ML/04/stop_list.txt"):
    """Return the stop words stored one per line in the UTF-8 file ``path``.

    The file is read once per path and the result is kept in
    ``_STOP_WORDS_CACHE``, so tokenizing many e-mails does not reopen the
    stop list for every document.
    """
    if path not in _STOP_WORDS_CACHE:
        # ``with`` closes the handle even if reading or decoding fails.
        with codecs.open(path, "r", "UTF-8") as stop_file:
            _STOP_WORDS_CACHE[path] = frozenset(stop_file.read().splitlines())
    return _STOP_WORDS_CACHE[path]


def segment2word(doc: str):
    """Tokenize ``doc`` with jieba and drop whitespace tokens and stop words.

    Args:
        doc: Raw text of one e-mail.

    Returns:
        list[str]: The surviving tokens in their original order. The list is
        empty when nothing survives the filtering (it never contains ``''``).

    Raises:
        OSError: If the stop-word file cannot be opened.
        UnicodeDecodeError: If the stop-word file is not valid UTF-8.
    """
    stop_words = _load_stop_words()
    # Tabs and line breaks become plain spaces before segmentation.
    doc = re.sub('[\t\r\n]', ' ', doc)
    segments = []
    for word in jieba.cut(doc.strip()):
        if word in stop_words:
            continue
        token = word.strip()
        # Collect tokens directly instead of joining and re-splitting a
        # string: whitespace-only tokens would otherwise turn into ''
        # entries in the result.
        if token:
            segments.append(token)
    return segments


In [None]:
# 构建文本读取函数
def getDatafromDir(data_dir):
    """Tokenize every ``.txt`` file located directly inside ``data_dir``.

    Files are processed in sorted name order. A file that cannot be read,
    decoded as UTF-8 or tokenized is reported on stdout and skipped, so the
    result may contain fewer entries than there are ``.txt`` files.

    Args:
        data_dir: Path of the directory holding the e-mail text files.

    Returns:
        list[list[str]]: One token list (as produced by ``segment2word``)
        per successfully processed file.

    Raises:
        OSError: If ``data_dir`` itself cannot be listed.
    """
    # Function-scope import: only ``listdir`` is imported from ``os`` at file level.
    from os.path import join as join_path

    docLists = []
    # listdir() returns names in arbitrary order; sorting keeps the result
    # identical from run to run and from machine to machine.
    docLabels = sorted(f for f in listdir(data_dir) if f.endswith('.txt'))
    for doc in docLabels:
        # join() avoids the doubled separator when data_dir ends with "/".
        filepath = join_path(data_dir, doc)
        try:
            # ``with`` closes the file handle even when decoding fails.
            with codecs.open(filepath, "r", "UTF-8") as mail_file:
                text = mail_file.read()
            docLists.append(segment2word(text))
        except Exception as exc:
            # Best effort: keep going, but say why the file was skipped.
            # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit through.
            print("handling file %s is error!! (%r)" % (filepath, exc))
    return docLists


In [None]:
# Build the data set
TOP_WORD_COUNT = 500  # number of most frequent words used as features

spamDocList = getDatafromDir("./ML/04/email/spam/")  # tokenized spam e-mails
hamDocList = getDatafromDir("./ML/04/email/ham/")  # tokenized normal e-mails
fullDocList = spamDocList + hamDocList  # all documents, spam first
# Labels in the same order as fullDocList: 1 = spam, 0 = normal mail.
classList = array([1]*len(spamDocList)+[0]*len(hamDocList))
# Corpus-wide word frequencies.
frequencyDic = Counter(chain.from_iterable(fullDocList))
# The TOP_WORD_COUNT most frequent words define the feature columns.
topWords = [word for word, _ in frequencyDic.most_common(TOP_WORD_COUNT)]
vector_rows = []
for docList in fullDocList:
    # Count the document once instead of calling list.count() for every
    # top word; a Counter returns 0 for words the document does not contain.
    doc_counts = Counter(docList)
    vector_rows.append([doc_counts[word] for word in topWords])
# Feature matrix: one row per e-mail, one column per top word.
vector = array(vector_rows)


In [None]:
# Model training
# A multinomial naive Bayes classifier is fitted with `vector` as the
# feature matrix and `classList` as the labels.
model = MultinomialNB()
model.fit(X=vector, y=classList)


In [None]:
# Model testing
# NOTE(review): this is the same directory the spam training examples are
# read from, so the predictions below are made on training data.
test_dir = "./ML/04/email/spam/"
# Names of the e-mails to classify, sorted so the output order is reproducible.
docLabels = sorted(f for f in listdir(test_dir) if f.endswith('.txt'))
dataList = []  # tokens of the e-mail currently being classified
# Model inference
for doc in docLabels:
    filepath = test_dir + "/" + doc
    try:
        # ``with`` closes the file handle even when decoding fails.
        with codecs.open(filepath, "r", "UTF-8") as mail_file:
            dataList = segment2word(mail_file.read())
    except Exception as exc:
        print("handling file %s is error!! (%r)" % (filepath, exc))
        # Skip this file: predicting now would reuse the tokens of the
        # previous e-mail and report that result under this file's name.
        continue
    # Feature extraction: how often each top word occurs in this e-mail.
    word_counts = Counter(dataList)
    testVector = array([word_counts[word] for word in topWords])
    # The model expects a 2-D input: one row holding this single sample.
    testVector_reshape = testVector.reshape(1, -1)
    predicted_label = model.predict(testVector_reshape)
    if predicted_label[0] == 1:
        print("%s is spam mail" % doc)
    else:
        print("%s is NOT spam mail" % doc)
