In [5]:
%matplotlib inline
import os
import re

import numpy as np
import matplotlib.pyplot as plt

In [8]:
def loadDataSet():
    """Return the toy corpus used throughout the notebook.

    Returns:
        (postingList, classVec): a list of six tokenised posts and the
        parallel list of labels, where 1 marks an abusive post and 0 a
        non-abusive one.
    """
    # Each entry pairs a label (1 = abusive, 0 = not) with its tokenised post.
    labelled_posts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [tokens for _, tokens in labelled_posts]
    classVec = [label for label, _ in labelled_posts]
    return postingList, classVec

def createVocabList(dataSet):
    """Return the list of unique tokens found in ``dataSet``.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list holding every distinct token exactly once, in order of first
        appearance. Converting a set straight to a list yields an arbitrary
        order (and, with string hash randomisation, a different one per
        process), which made the feature columns irreproducible between runs.
    """
    seen = set()
    vocab = []
    for document in dataSet:
        for word in document:
            if word not in seen:
                seen.add(word)
                vocab.append(word)
    return vocab

def setOfWord2Vec(vocabList, inputSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: sequence of vocabulary tokens; position defines the
            feature index.
        inputSet: iterable of tokens making up the document.

    Returns:
        A list of ``len(vocabList)`` ints: 1 where the vocabulary token occurs
        in ``inputSet`` at least once, 0 otherwise. Tokens missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    # Build the token -> index map once instead of scanning the vocabulary
    # list twice per word. setdefault keeps the first index for duplicated
    # tokens, matching list.index().
    position = {}
    for idx, token in enumerate(vocabList):
        position.setdefault(token, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            # Single-argument print(...) produces the same output on
            # Python 2 and Python 3.
            print("the word: %s is not in my vocabulary!" % word)
        else:
            returnVec[idx] = 1
    return returnVec

# Swapping setOfWord2Vec for bagOfWords2VecMN is what gives the complete
# multinomial model. In the terms used by "Machine Learning in Action", this
# is the move from the set-of-words model to the bag-of-words model; the
# set-of-words model is a half-way house between the Bernoulli and the
# multinomial models.
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    Returns a list of ``len(vocabList)`` ints where each entry counts how many
    times the corresponding vocabulary token appears in ``inputSet``. Tokens
    that are not in the vocabulary are skipped silently.
    """
    counts = [0] * len(vocabList)
    for token in inputSet:
        try:
            slot = vocabList.index(token)
        except ValueError:
            # Token is not part of the vocabulary.
            continue
        counts[slot] += 1
    return counts

def trainNB0(trainMatrix, trainCategory):
    """Estimate smoothed log word likelihoods and the class-1 prior.

    Args:
        trainMatrix: sequence of equally long numeric word vectors, one per
            training document.
        trainCategory: sequence of labels parallel to ``trainMatrix``; a
            document counts towards class 1 when its label equals 1 and
            towards class 0 otherwise.

    Returns:
        (p0Vect, p1Vect, pAbusive): per-word log likelihood arrays for class 0
        and class 1, and the fraction of documents labelled 1.
    """
    doc_total = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    prior_class1 = sum(trainCategory) / float(doc_total)

    # Slot 0 accumulates class 0, slot 1 accumulates class 1. Counts start at
    # one and totals at 2.0 so that no likelihood is ever exactly zero.
    # NOTE(review): the 2.0 offset does not equal the vocabulary size that the
    # +1-per-word numerator would call for, so the per-class values need not
    # sum to 1 -- confirm whether this is intentional.
    word_counts = [np.ones(vocab_size), np.ones(vocab_size)]
    word_totals = [2.0, 2.0]

    for row in range(doc_total):
        cls = 1 if trainCategory[row] == 1 else 0
        word_counts[cls] += trainMatrix[row]
        word_totals[cls] += sum(trainMatrix[row])

    # Work in log space so that multiplying many small probabilities later
    # becomes a sum.
    log_likelihood_1 = np.log(word_counts[1] / word_totals[1])
    log_likelihood_0 = np.log(word_counts[0] / word_totals[0])
    return log_likelihood_0, log_likelihood_1, prior_class1

In [10]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger naive Bayes log score.

    Args:
        vec2Classify: word vector of the document to classify.
        p0Vec: per-word log likelihoods for class 0.
        p1Vec: per-word log likelihoods for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0.
    """
    # Author's note: for words repeated within a document, training computes
    # the conditional probabilities in the multinomial style, yet the word
    # vectors themselves are built in the Bernoulli (set-of-words) style, so
    # at test time a word repeated in the test sample is counted only once.
    score_class1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    score_class0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if score_class1 > score_class0 else 0
    
def testingNB():
    """Train on the toy corpus and print the predicted class of two sample posts.

    Builds the vocabulary from ``loadDataSet()``, encodes every post with
    ``setOfWord2Vec``, fits ``trainNB0`` and prints one
    ``<tokens> classified as :  <label>`` line per sample entry.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWord2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))

    sampleEntries = (['love', 'my', 'dalmation'], ['stupid', 'garbage'])
    for testEntry in sampleEntries:
        thisDoc = np.array(setOfWord2Vec(myVocabList, testEntry))
        # A single pre-formatted argument prints identically on Python 2 and 3
        # (a multi-argument print statement is a SyntaxError on Python 3).
        print("%s classified as :  %s" % (testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))

# Train on the toy posts and print the classification of the two sample entries.
testingNB()

['love', 'my', 'dalmation'] classified as :  0
['stupid', 'garbage'] classified as :  1


In [12]:
# Application: spam e-mail classification
def textParse(bigString):
    """Split raw text into lower-cased tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of lower-cased tokens, obtained by splitting on runs of
        non-word characters and discarding tokens of length <= 2.
    """
    # The separator must be `\W+`, not `\W*`: a pattern that can match the
    # empty string makes re.split on Python 3.7+ cut between every character,
    # after which the length filter would discard every token.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    """Run one random hold-out evaluation of the spam classifier.

    Reads ``email/spam/1.txt`` .. ``25.txt`` and ``email/ham/1.txt`` ..
    ``25.txt`` relative to the working directory, holds out 10 randomly chosen
    documents for testing, trains on the rest and prints the error rate on the
    held-out documents. Nothing is returned.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Spam first, then ham, for each file number.
        for label, folder in ((1, 'spam'), (0, 'ham')):
            # os.path.join keeps the path usable outside Windows; `with`
            # closes the handle instead of leaking it.
            path = os.path.join('.', 'email', folder, '%d.txt' % i)
            with open(path) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)

    vocabList = createVocabList(docList)

    # A real list is required: entries are removed below, which a Python 3
    # range object does not support.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [setOfWord2Vec(vocabList, docList[docIndex]) for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = np.array(setOfWord2Vec(vocabList, docList[docIndex]))
        if classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    # A single pre-formatted argument prints identically on Python 2 and 3.
    print('the error rate is :  %s' % (float(errorCount) / len(testSet)))
        
# Run a single random hold-out evaluation; the printed error rate varies from run to run
# because the test documents are drawn with np.random without a fixed seed.
spamTest()

[0, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34, 37, 38, 39, 40, 41, 43, 45, 46, 47, 48, 49] [-4.89485026 -5.81114099 -5.81114099 -5.81114099 -6.50428817 -6.50428817
 -5.81114099 -5.81114099 -5.81114099 -6.50428817 -5.81114099 -5.81114099
 -5.81114099 -5.81114099 -5.81114099 -5.81114099 -5.81114099 -6.50428817
 -5.81114099 -5.81114099 -6.50428817 -6.50428817 -5.81114099 -6.50428817
 -5.81114099 -5.81114099 -5.81114099 -6.50428817 -5.81114099 -5.81114099
 -5.81114099 -5.40567588 -5.81114099 -5.81114099 -5.81114099 -6.50428817
 -4.89485026 -6.50428817 -5.81114099 -5.81114099 -5.81114099 -6.50428817
 -5.81114099 -6.50428817 -5.81114099 -5.81114099 -6.50428817 -5.81114099
 -5.81114099 -5.81114099 -5.81114099 -5.81114099 -5.81114099 -5.81114099
 -6.50428817 -6.50428817 -6.50428817 -5.40567588 -5.11799381 -6.50428817
 -6.50428817 -5.81114099 -5.81114099 -6.50428817 -5.81114099 -6.50428817
 -5.81114099 -6.50428817 -5.81114099 -5.4056

In [9]:
# `*` between two 1-D ndarrays multiplies element by element.
np.array([2,3,4]) * np.array([3,4,5])

array([ 6, 12, 20])

In [11]:
# `*` between np.matrix operands is a matrix product: a 3x1 column times a 1x3 row gives a 3x3 matrix.
np.matrix([2,3,4]).T * np.matrix([3,4,5])

matrix([[ 6,  8, 10],
        [ 9, 12, 15],
        [12, 16, 20]])

In [None]:
# Encode every toy post as a set-of-words vector and fit the classifier on them.
postingList, classVec = loadDataSet()
vocabSet = createVocabList(postingList)  # a list of unique tokens, despite the name
train_word2vec = [setOfWord2Vec(vocabSet, post) for post in postingList]
p0Vect, p1Vect, pAbusive = trainNB0(train_word2vec, classVec)

In [None]:
# Recompute the per-class word statistics step by step, starting the counts at
# zero and skipping the log, so the raw numbers can be inspected in later cells.
# Overwrites pAbusive, p0Vect and p1Vect with these raw values.
trainMatrix, trainCategory = train_word2vec, classVec
numTrainDocs = len(trainMatrix)
numWords = len(trainMatrix[0])
pAbusive = sum(trainCategory) / float(numTrainDocs)

# Per-word occurrence counts and total word counts for each class.
p0Num, p1Num = np.zeros(numWords), np.zeros(numWords)
p0Denom = p1Denom = 0.0

for i in range(numTrainDocs):
    wordsInDoc = sum(trainMatrix[i])
    if trainCategory[i] != 1:
        p0Num += trainMatrix[i]
        p0Denom += wordsInDoc
    else:
        p1Num += trainMatrix[i]
        p1Denom += wordsInDoc

# Plain relative frequencies: words never seen in a class get exactly 0.
p0Vect = p0Num / p0Denom
p1Vect = p1Num / p1Denom

In [None]:
# Dump the vocabulary, the word vectors of each class and the raw counts
# computed above. Every print receives one pre-formatted argument so the
# output is the same on Python 2 and Python 3.
print("%s %s" % (vocabSet, len(vocabSet)))

# Class-0 documents followed by their summed word counts.
for docIndex in (0, 2, 4):
    print(trainMatrix[docIndex])
print(p0Num.astype(int).tolist())
print('\n')

# Class-1 documents followed by their summed word counts.
for docIndex in (1, 3, 5):
    print(trainMatrix[docIndex])
print(p1Num.astype(int).tolist())

# Words per document next to the accumulated class totals.
print("%s %s %s %s" % (sum(trainMatrix[0]), sum(trainMatrix[2]), sum(trainMatrix[4]), p0Denom))
print("%s %s %s %s" % (sum(trainMatrix[1]), sum(trainMatrix[3]), sum(trainMatrix[5]), p1Denom))

# Relative frequencies and their sums.
print("%s %s" % (p0Vect, sum(p0Vect)))
print("%s %s" % (p1Vect, sum(p1Vect)))

In [None]:
def my_trainNB(trainMatrix, trainCategory):
    """Print Bernoulli-style per-feature frequency estimates for two classes.

    For each class the function counts, per feature, how many samples have
    the feature set, divides by the number of samples in that class and
    prints the counts and resulting frequencies. Nothing is returned.

    Args:
        trainMatrix: sequence of equally long 0/1 feature vectors. The
            diagnostic printout indexes rows 0..5, so at least six samples
            are required.
        trainCategory: labels parallel to ``trainMatrix``; 1 is the positive
            (abusive) class and 0 the negative class. Samples with any other
            label contribute to neither count.
    """
    num_samples = len(trainMatrix)
    num_features = len(trainMatrix[0])
    num_pos = sum(trainCategory)
    num_neg = num_samples - num_pos
    p_pos = num_pos / float(num_samples)  # frequency of positive (abusive) samples
    p_neg = 1 - p_pos
    num_pos_fea_1 = np.zeros(num_features)
    num_neg_fea_1 = np.zeros(num_features)

    for i in range(num_samples):
        if trainCategory[i] == 1:
            num_pos_fea_1 += trainMatrix[i]
        elif trainCategory[i] == 0:
            num_neg_fea_1 += trainMatrix[i]

    # Use the counts accumulated above. The previous version read the
    # notebook globals p1Num / p0Num here, so the result depended on which
    # other cell had been run last.
    p_pos_fea_1 = num_pos_fea_1 / num_pos  # P(feature = 1 | positive class)
    p_pos_fea_0 = np.ones(num_features) - p_pos_fea_1  # P(feature = 0 | positive class)
    p_neg_fea_1 = num_neg_fea_1 / num_neg  # P(feature = 1 | negative class)
    p_neg_fea_0 = np.ones(num_features) - p_neg_fea_1  # P(feature = 0 | negative class)

    # Every print receives one pre-formatted argument so the output is the
    # same on Python 2 and Python 3.
    print("%s %s %s %s" % (num_pos, num_neg, p_pos, p_neg))

    print(trainMatrix[1])
    print(trainMatrix[3])
    print(trainMatrix[5])
    print(num_pos_fea_1.astype(int).tolist())
    print(p_pos_fea_1)
    print(p_pos_fea_0)
    print('\n')
    print(trainMatrix[0])
    print(trainMatrix[2])
    print(trainMatrix[4])
    print(num_neg_fea_1.astype(int).tolist())
    print(p_neg_fea_1)
    print(p_neg_fea_0)
            
    
# Print the per-class feature frequencies for the toy data prepared in the cells above.
my_trainNB(trainMatrix, trainCategory)    