In [1]:
'''
加载训练数据, postingList是所有的训练集, 每一个列表代表一条言论, 一共有8条言论
            classVec代表每一条言论的类别, 0是正常, 1是有侮辱性
            返回 言论和类别
'''
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 
                  ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 
                  ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'hime'], 
                  ['stop', 'posting', 'stupid', 'worthless', 'garbage'], 
                  ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 
                  ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec

In [2]:
'''
创建词汇表, 就是把这个文档中所有的单词不重复的放在一个列表里面
'''
def createVocabList(dataSet):
    vocabSet = set([])           # 新建一个set集合, 保证里面的数据不重复
    for document in dataSet:     # 获得每一个文档
        vocabSet = vocabSet | set(document)   # 将这个文档去重之后和词汇表求并集
    return list(vocabSet)                     # 将词汇表转换为列表返回

In [3]:
'''
vocabList是由createVocabList产生的词汇表
inputSet是输入新的文档
'''
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)  # 生成一个全0列表, 个数为输入文档的长度
    for word in inputSet:             # 遍历输入文档中的每一个单词
        if word in vocabList:         # 如果这个单词在词汇表中
            returnVec[vocabList.index(word)] = 1  # 列表中该位置置1
        else:                                     # 否则依然为0
            print("the word %s is not in my Vocabulary" % word) 
    return returnVec

In [4]:
listOPosts, listClasses = loadDataSet()

In [5]:
listOPosts, listClasses

([['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
  ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
  ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'hime'],
  ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
  ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
  ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']],
 [0, 1, 0, 1, 0, 1])

In [6]:
myVocabList = createVocabList(listOPosts)

In [7]:
 print(myVocabList)

['problems', 'dog', 'please', 'is', 'him', 'stop', 'licks', 'not', 'food', 'maybe', 'my', 'steak', 'stupid', 'love', 'posting', 'to', 'hime', 'park', 'garbage', 'ate', 'take', 'buying', 'I', 'cute', 'mr', 'quit', 'help', 'how', 'dalmation', 'has', 'worthless', 'flea', 'so']


In [8]:
print(listOPosts[0])
print(setOfWords2Vec(myVocabList, listOPosts[0]))

['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']
[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0]


In [9]:
from numpy import *

In [10]:
'''
计算先验概率
trainMatrix: 词向量矩阵
trainCategory: 每一个词向量的类别
返回每一个单词属于侮辱性和非侮辱性词汇的先验概率, 以及训练集包含侮辱性文档的概率
'''
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)    # 由训练集生成的词向量矩阵
    numWords = len(trainMatrix[0])     # 每一个词向量的长度
    pAbusive  = sum(trainCategory) / float(numTrainDocs)    # 计算侮辱性文档的先验概率
    p0Num = ones(numWords)             # 生成全1 array, 长度为词向量的长度, 用于统计每个单词在整个矩阵中出现的次数(分子)
    p1Num = ones(numWords)
    p0Denom = 2.0                      # 初始化为2(分母), 拉普拉斯平滑
    p1Denom = 2.0
    for i in range(numTrainDocs):      # 遍历每一个词向量
        if trainCategory[i] == 1:      # 如果该词向量的类别为1
            p1Num += trainMatrix[i]    # 计算P(x0) P(x1) P(xn)
            p1Denom += 1               # 统计侮辱性文档的个数
        else:
            p0Num += trainMatrix[i]    # 计算P(x0) P(x1) P(xn)
            p0Denom += 1               # 统计非侮辱性的文档个数
    p0Vect = log(p0Num / p0Denom)      # 计算P(x0|0) P(x1|0) P(xn|0)
    p1Vect = log(p1Num / p1Denom)      # 计算P(x0|1) P(x1|1) P(xn|1)   取对数是防止多个小数相乘出现下溢
    return p0Vect, p1Vect, pAbusive

In [11]:
'''
制作词向量矩阵
将每一个文档转换为词向量, 然后放入矩阵中
'''
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))

In [12]:
p0V, p1V, pAb = trainNB0(trainMat, listClasses)
p0V, p1V, pAb

(array([-0.91629073, -0.91629073, -0.91629073, -0.91629073, -0.91629073,
        -0.91629073, -0.91629073, -1.60943791, -1.60943791, -1.60943791,
        -0.22314355, -0.91629073, -1.60943791, -0.91629073, -1.60943791,
        -0.91629073, -0.91629073, -1.60943791, -1.60943791, -0.91629073,
        -1.60943791, -1.60943791, -0.91629073, -0.91629073, -0.91629073,
        -1.60943791, -0.91629073, -0.91629073, -0.91629073, -0.91629073,
        -1.60943791, -0.91629073, -0.91629073]),
 array([-1.60943791, -0.51082562, -1.60943791, -1.60943791, -0.91629073,
        -0.91629073, -1.60943791, -0.91629073, -0.91629073, -0.91629073,
        -1.60943791, -1.60943791, -0.22314355, -1.60943791, -0.91629073,
        -0.91629073, -1.60943791, -0.91629073, -0.91629073, -1.60943791,
        -0.91629073, -0.91629073, -1.60943791, -1.60943791, -1.60943791,
        -0.91629073, -1.60943791, -1.60943791, -1.60943791, -1.60943791,
        -0.51082562, -1.60943791, -1.60943791]),
 0.5)

In [13]:
'''
制作贝叶斯分类器
vec2Classify: 测试样本的词向量
p0Vec: P(x0|Y=0) P(x1|Y=0) P(xn|Y=0)
p1Vec: P(x0|Y=1) P(x1|Y=1) P(xn|Y=1)
pClass1: P(y)
# log(P(x1|1)*P(x2|1)*P(x3|1)P(1))=log(P(x1|1))+log(P(x2|1))+log(P(1))
'''
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)       
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

In [14]:
'''
测试贝叶斯分类器
'''
def testingNB():
    listOPosts, listClasses =  loadDataSet()       # 加载数据
    myVocabList = createVocabList(listOPosts)      # 制作词汇表
    trainMat = []                                  # 制作训练集词向量
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(trainMat, listClasses) # 计算先验概率
    testEntry = ['love', 'my', 'dalmation']         # 测试文档1
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as :', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage', 'stupid']     # 测试文档2
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as : ', classifyNB(thisDoc, p0V, p1V, pAb))