# Naive Bayes

## 4.5.1 准备数据：从文本中构建词向量

In [1]:
def loadDataSet():
    """Return the toy forum-post corpus together with its class labels.

    Returns:
        A ``(postingList, classVec)`` pair.  ``postingList`` is a list of six
        tokenized posts (each a list of words) and ``classVec`` holds the
        matching labels, where 1 marks abusive text.
    """
    # Each entry pairs a label with its post; the posts are written as plain
    # sentences and tokenized on whitespace below.
    labelledPosts = (
        (0, 'my dog has flea problems help please'),
        (1, 'maybe not take him to dog park stupid'),
        (0, 'my dalmation is so cute I love him'),
        (1, 'stop posting stupid worthless garbage'),
        (0, 'mr licks ate my steak how to stop him'),
        (1, 'quit buying worthless dog food stupid'),
    )
    postingList = [sentence.split() for _, sentence in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec

def createVocabList(dataSet):
    """Collect every distinct word that appears in any document.

    Args:
        dataSet: An iterable of documents, each an iterable of hashable words.

    Returns:
        A list with one entry per distinct word.  The list is built from a
        set, so its order is arbitrary.
    """
    uniqueWords = set().union(*dataSet)
    return list(uniqueWords)

def setOfWords2Vec(vocabList,inputSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: List of known words; position i of the result refers to
            ``vocabList[i]``.  Words are assumed to be hashable (e.g. str).
        inputSet: Iterable of the words in the document to encode.

    Returns:
        A list of ``len(vocabList)`` ints: 1 where the vocabulary word occurs
        in ``inputSet``, else 0.  Words missing from the vocabulary are
        reported on stdout and otherwise ignored.
    """
    # Map each word to its first position once, instead of scanning the list
    # twice ('in' plus '.index') for every input word.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('the word :%s is not in my Vocabulary!' % (word,))
        else:
            returnVec[position] = 1
    return returnVec

In [2]:
# Load the sample corpus, derive its vocabulary, and show the set-of-words
# vector of the first post (the bare call on the last line is the cell output).
listOfPosts,listClasses=loadDataSet()
myVocabList=createVocabList(listOfPosts)
setOfWords2Vec(myVocabList,listOfPosts[0])

[0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1]

## 4.5.2 训练算法：从词向量计算概率

In [3]:
import numpy as np
def trainNB0(trainMatrix,trainCategory):
    """Estimate naive Bayes parameters from word vectors.

    Args:
        trainMatrix: One row per training document, one column per vocabulary
            word (a list of lists or a 2-D array).
        trainCategory: Label per document; rows labelled 1 feed the class-1
            estimates, every other row feeds the class-0 estimates.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)`` where ``p0Vect`` / ``p1Vect`` hold the
        per-word log p(x_i | class=0) / log p(x_i | class=1) and ``pAbusive``
        is the smoothed prior p(class=1).
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])

    # Laplace-smoothed prior: p(class=1) = (|D1| + 1) / (|D| + 2).
    pAbusive = 1.0 * (sum(trainCategory) + 1) / (docCount + 2)

    # Smoothed accumulators keyed by class: every word count starts at 1 and
    # every denominator at 2, so an unseen word never gets probability zero.
    numerators = {0: np.ones(vocabSize), 1: np.ones(vocabSize)}
    denominators = {0: 2, 1: 2}
    for docIndex in range(docCount):
        bucket = 1 if trainCategory[docIndex] == 1 else 0
        numerators[bucket] += trainMatrix[docIndex]
        denominators[bucket] += sum(trainMatrix[docIndex])

    # Return logs so that later sums of many small probabilities do not
    # underflow.
    p1Vect = np.log(1.0 * numerators[1] / denominators[1])  # log p(x_i | class=1)
    p0Vect = np.log(1.0 * numerators[0] / denominators[0])  # log p(x_i | class=0)
    return p0Vect, p1Vect, pAbusive

In [4]:
# Vectorize every post against the shared vocabulary; the bare name on the
# last line shows the resulting matrix as the cell output.
trainMat=[setOfWords2Vec(myVocabList,postinDoc) for postinDoc in listOfPosts]

trainMat

[[0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  0],
 [1,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1],
 [0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  1],
 [0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0]]

In [5]:
# Train on the toy corpus: per-word log-probability vectors for class 0 and
# class 1, plus the smoothed prior of class 1.
p0V,p1V,pAb=trainNB0(trainMat,listClasses)

In [6]:
# Display the class-0 log-probability vector returned by trainNB0.
p0V

array([-2.56494936, -2.56494936, -2.56494936, -3.25809654, -3.25809654,
       -2.56494936, -2.56494936, -2.56494936, -3.25809654, -2.56494936,
       -2.56494936, -2.56494936, -2.56494936, -3.25809654, -3.25809654,
       -2.15948425, -3.25809654, -3.25809654, -2.56494936, -3.25809654,
       -2.56494936, -2.56494936, -3.25809654, -2.56494936, -2.56494936,
       -2.56494936, -3.25809654, -2.56494936, -3.25809654, -2.56494936,
       -2.56494936, -1.87180218])

## 4.5.3 测试算法：根据现实情况修改分类器

In [7]:
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    """Pick the more probable class for one word vector.

    Args:
        vec2Classify: NumPy word vector of the document; it is multiplied
            element-wise with the two log-probability vectors.
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0 log
        score, otherwise 0 (so ties go to class 0).
    """
    # Log-space score = summed word log-probabilities + log prior.
    logPrior1 = np.log(pClass1)
    logPrior0 = np.log(1 - pClass1)
    score1 = np.sum(vec2Classify * p1Vec) + logPrior1
    score0 = np.sum(vec2Classify * p0Vec) + logPrior0
    if score1 > score0:
        return 1
    return 0

def testingNB():
    """Train on the toy corpus and print the predicted class of two posts.

    Builds the vocabulary and set-of-words training matrix from
    ``loadDataSet()``, trains with ``trainNB0`` and classifies two fixed
    sample posts, printing one ``<words> classified as:  <label>`` line for
    each.  Returns nothing.
    """
    listOfPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOfPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc)
                for postinDoc in listOfPosts]

    # The prior must be bound to the same local name that is passed to
    # classifyNB below; a differently-cased name would make the call read a
    # module-level variable (or raise NameError when none exists).
    p0V, p1V, pAb = trainNB0(trainMat, listClasses)

    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        predicted = classifyNB(thisDoc, p0V, p1V, pAb)
        # One pre-formatted argument keeps the output identical on Python 2
        # and Python 3.
        print('%s classified as:  %s' % (testEntry, predicted))

In [8]:
# Run the toy-corpus check; it prints the predicted class of two sample posts.
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


## 4.5.4 准备数据：文档词袋模型

In [9]:
def bagOfWords2VecMN(vocabList,inputSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    Args:
        vocabList: List of known words; position i of the result refers to
            ``vocabList[i]``.  Words are assumed to be hashable (e.g. str).
        inputSet: Iterable of the words in the document; repeated words are
            counted once per occurrence.

    Returns:
        A list of ``len(vocabList)`` ints holding how many times each
        vocabulary word occurs in ``inputSet``.  Words missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    # Map each word to its first position once, instead of scanning the list
    # twice ('in' plus '.index') for every input word.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('the word :%s is not in my Vocabulary!' % (word,))
        else:
            returnVec[position] += 1
    return returnVec

# 4.6 示例：使用朴素贝叶斯过滤垃圾邮件

## 4.6.1 准备数据：切分文本

In [10]:
import re
mySent='Is this your pen? I found it under the desk.'
# Split on runs of one or more non-word characters.  The separator pattern
# must not be able to match the empty string: since Python 3.7 re.split()
# also splits on empty matches, so '\W*' would cut the text into single
# characters.  On Python 2 both patterns give the same tokens.
regEx=re.compile(r'\W+')
listOfTokens=regEx.split(mySent)
listOfTokens

['Is', 'this', 'your', 'pen', 'I', 'found', 'it', 'under', 'the', 'desk', '']

In [11]:
# Read the sample e-mail '6.txt' and tokenize it with the regEx compiled in
# an earlier cell; the bare name on the last line shows the token list.
with open('6.txt') as fr:
    emailTest=fr.read()
listOfTokens=regEx.split(emailTest)
listOfTokens

['Hello',
 'Since',
 'you',
 'are',
 'an',
 'owner',
 'of',
 'at',
 'least',
 'one',
 'Google',
 'Groups',
 'group',
 'that',
 'uses',
 'the',
 'customized',
 'welcome',
 'message',
 'pages',
 'or',
 'files',
 'we',
 'are',
 'writing',
 'to',
 'inform',
 'you',
 'that',
 'we',
 'will',
 'no',
 'longer',
 'be',
 'supporting',
 'these',
 'features',
 'starting',
 'February',
 '2011',
 'We',
 'made',
 'this',
 'decision',
 'so',
 'that',
 'we',
 'can',
 'focus',
 'on',
 'improving',
 'the',
 'core',
 'functionalities',
 'of',
 'Google',
 'Groups',
 'mailing',
 'lists',
 'and',
 'forum',
 'discussions',
 'Instead',
 'of',
 'these',
 'features',
 'we',
 'encourage',
 'you',
 'to',
 'use',
 'products',
 'that',
 'are',
 'designed',
 'specifically',
 'for',
 'file',
 'storage',
 'and',
 'page',
 'creation',
 'such',
 'as',
 'Google',
 'Docs',
 'and',
 'Google',
 'Sites',
 'For',
 'example',
 'you',
 'can',
 'easily',
 'create',
 'your',
 'pages',
 'on',
 'Google',
 'Sites',
 'and',
 'share',


## 4.6.2 测试算法：使用朴素贝叶斯进行交叉验证

In [12]:
def textParse(bigString):
    """Split raw text into lowercase tokens longer than two characters.

    Args:
        bigString: The text to tokenize.

    Returns:
        A list of the lowercased tokens of ``bigString`` (split on runs of
        non-word characters) whose length is greater than 2.
    """
    # '\W+' rather than '\W*': the separator has to consume at least one
    # character.  Since Python 3.7 re.split() also splits on empty matches,
    # which would reduce every token to a single character and make this
    # function return an empty list.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    """Hold-out test of the naive Bayes spam filter on the e-mail corpus.

    Reads ``email/spam/1.txt`` .. ``25.txt`` (label 1) and
    ``email/ham/1.txt`` .. ``25.txt`` (label 0), moves 10 randomly chosen
    messages into a test set, trains on the remaining ones with bag-of-words
    vectors, and prints every misclassified document followed by the error
    rate on the test set.  Returns nothing.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Spam first, then ham, for each number: this keeps the document
        # order (and therefore the document indices) interleaved.
        for folder, label in (('spam', 1), ('ham', 0)):
            with open('email/%s/%d.txt' % (folder, i)) as fr:
                docList.append(textParse(fr.read()))
            classList.append(label)

    vocabList = createVocabList(docList)

    # A real list is needed here: indices are removed as they move to the
    # test set, which a Python 3 range object does not support.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        predicted = classifyNB(np.array(wordVector), p0V, p1V, pSpam)
        if predicted != classList[docIndex]:
            errorCount += 1
            # One pre-formatted argument keeps the output identical on
            # Python 2 and Python 3.
            print('classification error %s' % (docList[docIndex],))
    print('the error rate is:  %s' % (1.0 * errorCount / len(testSet)))

In [13]:
# Run one random hold-out evaluation; misclassified e-mails and the error
# rate are printed by spamTest itself.
spamTest()

classification error ['benoit', 'mandelbrot', '1924', '2010', 'benoit', 'mandelbrot', '1924', '2010', 'wilmott', 'team', 'benoit', 'mandelbrot', 'the', 'mathematician', 'the', 'father', 'fractal', 'mathematics', 'and', 'advocate', 'more', 'sophisticated', 'modelling', 'quantitative', 'finance', 'died', '14th', 'october', '2010', 'aged', 'wilmott', 'magazine', 'has', 'often', 'featured', 'mandelbrot', 'his', 'ideas', 'and', 'the', 'work', 'others', 'inspired', 'his', 'fundamental', 'insights', 'you', 'must', 'logged', 'view', 'these', 'articles', 'from', 'past', 'issues', 'wilmott', 'magazine']
classification error ['scifinance', 'now', 'automatically', 'generates', 'gpu', 'enabled', 'pricing', 'risk', 'model', 'source', 'code', 'that', 'runs', '300x', 'faster', 'than', 'serial', 'code', 'using', 'new', 'nvidia', 'fermi', 'class', 'tesla', 'series', 'gpu', 'scifinance', 'derivatives', 'pricing', 'and', 'risk', 'model', 'development', 'tool', 'that', 'automatically', 'generates', 'and', 