## 采用`Laplace`平滑的`Naive Bayes`公式



$$ 
    P(X^{(i)} = x_{ij} | Y = y_k) = \frac{N(X^{(i)} = x_{ij}, y = y_{k}) + λ}{N(Y = y_k) + λJ}
$$

`J`为`X(i)`的所有可能取值的个数即维度，`λ`为平滑的程度，通常取`λ=1`，此时该平滑称为拉普拉斯平滑。

In [199]:
import pickle
from sklearn.datasets.base import Bunch

In [200]:
with open('./DB/matrix/BOW.pkl', 'rb') as f:
    Set = pickle.load(f)

In [201]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import Pipeline

## 卡方检验进行降维
## 维度选择1万

In [202]:
# 维度取10000
scale = 10000
Matrix = Pipeline([('max', MaxAbsScaler()), ('chi2', SelectKBest(chi2, k=scale))]).fit_transform(Set.matrix, Set.y_target)
y_target = Set.y_target

In [203]:
from sklearn.feature_extraction.text import TfidfTransformer
Matrix = MaxAbsScaler().fit(Matrix).inverse_transform(Matrix)
Maxtrix = Pipeline([('tfidf', TfidfTransformer()), ('max', MaxAbsScaler())]).fit_transform(Matrix)

In [204]:
from sklearn.cross_validation import train_test_split

In [205]:
trainMatrix, testMatrix, trainTarget, testTarget = train_test_split(Matrix, y_target, train_size = 0.3, stratify = y_target)

In [206]:
import numpy as np
import scipy.sparse

In [207]:
import time
t_beg = time.time()

In [208]:
Y_PreP = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
for i in range(len(trainTarget)):
    Y_PreP[trainTarget[i]] += trainMatrix.getrow(i).sum()
Y_PreP

[84410.081028610744,
 76856.95512371567,
 66439.278759946639,
 92252.112222721495,
 54772.401197431522,
 114736.44869584768,
 53852.748539911292,
 52350.287995033184,
 49817.865817728591,
 126943.28356739019,
 90895.511142668256]

In [209]:
NBmach = np.zeros([11, scale], dtype=float)

In [210]:
for key, value in trainMatrix.todok().items():
    NBmach[trainTarget[key[0]], key[1]]  += (value + 1) / (scale + Y_PreP[trainTarget[key[0]]])
t_end = time.time()
print("train time: %.2lf" % (t_end - t_beg), NBmach.shape)

(11, 10000)
train time: 100.43 (11, 10000)


In [211]:
res = []
import time
beg = time.time()
for i in range(len(testTarget)):
    back = -1
    tmp = -1
    for j, val in enumerate(np.dot(NBmach, testMatrix.getrow(i).toarray().T)):
        if val > tmp:
            tmp = val
            back = j
    res.append(back)

end = time.time()
print("predict time:", end - beg)

predict time: 264.8523299694061


In [212]:
len(res)

902869

In [213]:
len(testTarget)

902869

In [214]:
label = ['财经', '股票', '教育', '科技', '时政','体育','游戏','娱乐','汽车','社会','军事']
from sklearn.metrics import classification_report
prt_res = classification_report(testTarget, res, target_names=label)
print(prt_res)

precision    recall  f1-score   support

         财经       0.64      0.17      0.26     72456
         股票       0.64      0.96      0.77    108079
         教育       0.82      0.72      0.77     70945
         科技       0.91      0.71      0.80    114050
         时政       0.82      0.65      0.73     73408
         体育       0.98      0.94      0.96     92122
         游戏       0.86      0.96      0.91     73959
         娱乐       0.89      0.91      0.90     74200
         汽车       0.82      0.95      0.88     71192
         社会       0.92      1.00      0.96     81826
         军事       0.72      1.00      0.84     70632

avg / total       0.82      0.82      0.80    902869



In [215]:
end = time.time()
print("cost time: %.2lf" % (end - beg))

cost time: 266.80


In [216]:
with open('result/my_bayes.csv', 'w', encoding='utf-8') as f:
    f.write(prt_res)
    f.close()