## 基于概率论的分类方法：朴素贝叶斯    
**称为“朴素”是因为整个形式化过程只做最原始、最简单的假设。**     
优点：      
* 在数据较少的情况下仍然有效，可以处理多类别问题。    
缺点：     
* 对于输入数据的准备方式较为敏感。      
适用数据类型：标称型数据。       

**贝叶斯决策理论的核心思想：选择高概率对应的类型作为输入数据的类型。**

朴素贝叶斯的一般过程：
1. 收集数据：可以使用任何方法。本章使用RSS源。
2. 准备数据：需要数值型或者布尔型数据
3. 分析数据：有大量特征时，绘制特征作用不大，此时使用直方图效果更好。
4. 训练算法：计算不同的独立特征的条件概率。
5. 测试算法：计算错误率。
6. 使用算法：一个常见的朴素贝叶斯应用是文档分类。可以在任意的分类场景中使用朴素贝叶斯分类器，不一定非要是文本。

In [65]:
from numpy import *

In [66]:
#词表到向量的转换函数
def loadDataSet():
    positingList=[['my', 'dog', 'has', 'flea',  'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how','to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]               #1代表侮辱性文字，0代表正常言论
    return positingList, classVec

#创建一个包含在所有文档中出现的不重复词的列表
def createVocabList(dataSet):
    #创建一个空集
    vocabSet = set([])
    for document in dataSet:
        #创建两个空集的并集
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

#获得文档向量
def setOfWords2Vec(vocabList, inputSet):
    #创建一个其中所含元素都为0的向量
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in inputSet:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec
    

In [15]:
listOPsts,listClasses=loadDataSet()

In [16]:
myVocabList=sorted(createVocabList(listOPsts))

In [17]:
myVocabList

['I',
 'ate',
 'buying',
 'cute',
 'dalmation',
 'dog',
 'flea',
 'food',
 'garbage',
 'has',
 'help',
 'him',
 'how',
 'is',
 'licks',
 'love',
 'maybe',
 'mr',
 'my',
 'not',
 'park',
 'please',
 'posting',
 'problems',
 'quit',
 'so',
 'steak',
 'stop',
 'stupid',
 'take',
 'to',
 'worthless']

In [18]:
setOfWords2Vec(myVocabList, listOPsts[0])

[0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

### 训练算法：从词向量计算概率：        

$$p(c_i|w)=\frac{p(w|c_i)p(c_i)}{p(w)}$$      

函数的伪代码为：      
* 计算每个类别中的文档数目      
* 对每篇训练文档：      
    * 对每个类别：       
        * 如果词条出现在文档中→ 增加该词条的计数值      
        * 增加所有词条的计数值      
* 对每个类别：      
     * 对每个词条：        
        * 将该词条的数目除以总词条数目得到条件概率      
* 返回每个类别的条件概率　　

In [67]:
#朴素贝叶斯分类器训练函数
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    #初始化概率
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            #向量相加
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
        #对每个元素做除法
    p1Vect = log(p1Num / p1Denom)       #change to log()
    p0Vect = log(p0Num / p0Denom)       #change to log()
    return p0Vect, p1Vect, pAbusive

In [52]:
listOPosts, listClasses = loadDataSet()

In [53]:
myVocabList = createVocabList(listOPosts)

In [54]:
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))

In [55]:
p0V, p1V, pAb = trainNB0(trainMat, listClasses)

In [56]:
p0V

array([-3.25809654, -3.25809654, -2.56494936, -2.56494936, -2.56494936,
       -2.56494936, -2.56494936, -2.56494936, -2.56494936, -2.56494936,
       -2.56494936, -3.25809654, -2.56494936, -2.56494936, -3.25809654,
       -2.56494936, -3.25809654, -3.25809654, -1.87180218, -3.25809654,
       -2.56494936, -3.25809654, -3.25809654, -3.25809654, -2.56494936,
       -2.15948425, -3.25809654, -2.56494936, -2.56494936, -2.56494936,
       -2.56494936, -2.56494936])

In [57]:
p1V

array([-2.35137526, -1.94591015, -3.04452244, -3.04452244, -3.04452244,
       -2.35137526, -3.04452244, -3.04452244, -3.04452244, -3.04452244,
       -3.04452244, -2.35137526, -3.04452244, -1.94591015, -2.35137526,
       -3.04452244, -2.35137526, -1.65822808, -3.04452244, -2.35137526,
       -3.04452244, -2.35137526, -2.35137526, -2.35137526, -2.35137526,
       -2.35137526, -2.35137526, -3.04452244, -3.04452244, -3.04452244,
       -3.04452244, -3.04452244])

In [58]:
pAb

0.5

### 朴素贝叶斯分类函数


In [68]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    #元素相乘
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

    #遍历函数，封装所有的操作
def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for positinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0v, p1v, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0v, p1v, pAb))
    testEntry1 = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry1))
    print(testEntry1, 'classified as: ', classifyNB(thisDoc, p0v, p1v, pAb))

In [69]:
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  0


### 文档词袋模型

In [70]:
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

### 使用朴素贝叶斯过滤垃圾邮件        
1. 收集数据：提供文本文件。 
2. 准备数据：将文本文件解析成词条向量。
3. 分析数据：检查词条确保解析的正确性。 
4. 训练算法：使用我们之前建立的trainNB0()函数。 
5. 测试算法：使用classifyNB()，并且构建一个新的测试函数来计算文档集的错误率。 
6. 使用算法：构建一个完整的程序对一组文档进行分类，将错分的文档输出到屏幕上

In [109]:
#文件解析及完整的垃圾邮件测试函数
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W*', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        #导入并解析文本文件
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i, encoding='gb18030',errors='ignore').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    #随机构建训练集
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    #对测试集分类
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))

In [110]:
spamTest()

the error rate is:  0.1


  return _compile(pattern, flags).split(string, maxsplit)


In [111]:
spamTest()

the error rate is:  0.1


  return _compile(pattern, flags).split(string, maxsplit)
