# 利用真实的数据来验证朴素贝叶斯 英文文本

## 1 准备数据, 生成词汇表

In [18]:
import re
import numpy as np
import random as r

In [5]:
# 接收一段文字, 转换为字符串列表
def txtParse(bigString):
    listOfTokens = re.split('\W*', bigString)
    return [tok.lower() for tok in listOfTokens]

In [3]:
# 去重, 生成词汇表
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

In [17]:
# 将每一段输入文字转换为向量
def bagOfWords2vecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

In [28]:
# 计算贝叶斯分类器的各种先验概率
def trainNB(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAb = sum(trainCategory) / float(numTrainDocs)
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Denom += 1
            p1Num += trainMatrix[i]
        else:
            p0Denom += 1
            p0Num += trainMatrix[i]
    p0Vec = np.log(p0Num / p1Denom)
    p1Vec = np.log(p1Num / p1Denom)
    return p0Vec, p1Vec, pAb

In [20]:
# 朴素贝叶斯分类器函数
def classifyNB(vec2classify, p0Vec, p1Vec, pAb):
    p0 = sum(vec2classify * p0Vec) + np.log(pAb)
    p1 = sum(vec2classify * p1Vec) + np.log(pAb)
    if p0 > p1:
        return 0
    else:
        return 1

In [54]:
if __name__ == '__main__':
    docList = []
    classList = []
    for i in range(1, 26):
        wordList = txtParse(open('./Machine-Learning-master/Naive Bayes/email/spam/%d.utf8.converted' % i, 'r').read())
        docList.append(wordList)
        classList.append(1)
        wordList = txtParse(open('./Machine-Learning-master/Naive Bayes/email/ham/%d.utf8.converted' % i, 'r').read())
        docList.append(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainSet = list(range(50))
    testSet = []
    for i in range(10):
        randIndex = int(r.random() * len(trainSet))
        testSet.append(trainSet[randIndex])
        del trainSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainSet:
        trainMat.append(bagOfWords2vecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pAb = trainNB(trainMat, trainClasses)
    errorCount = 0
    for docIndex in testSet:
        word2Vec = bagOfWords2vecMN(vocabList, docList[docIndex])
        if classifyNB(word2Vec, p0V, p1V, pAb) != classList[docIndex]:
            errorCount += 1
            print("分类错误的测试集", docList[docIndex])
    print("错误率 %.2f " % (errorCount / len(testSet)*100) )
#     print(vocabList) 

分类错误的测试集 ['experience', 'with', 'biggerpenis', 'today', 'grow', '3', 'inches', 'more', 'the', 'safest', 'most', 'effective', 'methods', 'of_penisen1argement', 'save', 'your', 'time', 'and', 'money', 'bettererections', 'with', 'effective', 'ma1eenhancement', 'products', '1', 'ma1eenhancement', 'supplement', 'trusted', 'by', 'millions', 'buy', 'today', '']
分类错误的测试集 ['oem', 'adobe', 'microsoft', 'softwares', 'fast', 'order', 'and', 'download', 'microsoft', 'office', 'professional', 'plus', '2007', '2010', '129', 'microsoft', 'windows', '7', 'ultimate', '119', 'adobe', 'photoshop', 'cs5', 'extended', 'adobe', 'acrobat', '9', 'pro', 'extended', 'windows', 'xp', 'professional', 'thousand', 'more', 'titles']
错误率 20.00 


  return _compile(pattern, flags).split(string, maxsplit)
