# 使用`TF-IDF`值代替`DF`作为朴素贝叶斯的向量值
## 提升了较差类别的性能，同时也拉低了好类别的性能

In [9]:
import pickle
from sklearn.datasets.base import Bunch

def readObj(path):
    """Load and return the object pickled in the file at ``path``.

    Args:
        path: Filesystem path of the pickle file; it is opened in binary mode.

    Returns:
        The object reconstructed by ``pickle.load``.

    Raises:
        OSError: If the file cannot be opened.
        Exception: Whatever ``pickle.load`` raises for malformed content
            (for example ``pickle.UnpicklingError`` or ``EOFError``).
    """
    # NOTE(security): unpickling can execute arbitrary code, so only pass
    # paths to files that come from a trusted source.
    with open(path, 'rb') as f:
        # The ``with`` block closes the file on exit; no explicit close().
        return pickle.load(f)

# Load the pickled dataset object; later cells read its `matrix` and
# `y_target` attributes.
# NOTE(review): presumably a sklearn Bunch (imported above) - confirm.
Set = readObj('DB/matrix/BOW.pkl')

In [10]:
def featureSelection(matrix, y_target, k=50000):
    """Scale features by max-abs, then keep the ``k`` best by chi-squared score.

    Args:
        matrix: Feature matrix handed to ``MaxAbsScaler.fit_transform``.
        y_target: Class labels, one per row of ``matrix``, used to score
            the features with ``chi2``.
        k: Number of features to keep, forwarded to ``SelectKBest``.
            Defaults to 50000, the value that used to be hard-coded.

    Returns:
        The reduced feature matrix returned by ``SelectKBest.fit_transform``.
        Its shape is also printed as a progress message.
    """
    # Imports stay local to the function, as in the original cell; the
    # unused MinMaxScaler import has been dropped.
    from sklearn.feature_selection import SelectKBest, chi2
    from sklearn.preprocessing import MaxAbsScaler

    # Rescale every feature by its maximum absolute value before scoring.
    scaled = MaxAbsScaler().fit_transform(matrix)

    # Score each feature against the labels and keep the top ``k``.
    selected = SelectKBest(chi2, k=k).fit_transform(scaled, y_target)
    print(selected.shape)
    return selected

# Reduce the loaded feature matrix using the labels stored alongside it.
# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split performed later, so test-set labels influence which
# features are kept - confirm this is acceptable for the reported scores.
Matrix = featureSelection(Set.matrix, Set.y_target)

(1289812, 50000)


In [11]:
from sklearn.model_selection import train_test_split

# Stratified split on the class labels: 30% of the rows are used for
# training and the remainder for testing. No random_state is passed, so
# the exact partition differs from run to run.
(trainMatrix, testMatrix,
 trainTarget, testTarget) = train_test_split(
    Matrix,
    Set.y_target,
    train_size=0.3,
    stratify=Set.y_target,
)

# Report the size of each part of the split.
print(trainMatrix.shape, testMatrix.shape, len(trainTarget), len(testTarget))

(386943, 50000) (902869, 50000) 386943 902869


In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# alpha is MultinomialNB's additive (Laplace/Lidstone) smoothing parameter,
# not a convergence threshold: the model is estimated from counts and has
# no iterative stopping rule.
# It is passed by keyword because recent scikit-learn releases reject
# positional constructor arguments, and so that it visibly matches the
# alpha used for the baseline classifier without TF-IDF.
pipe = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('MnNB', MultinomialNB(alpha=0.1)),
]).fit(trainMatrix, trainTarget)

In [13]:
# Predict test labels with the fitted TF-IDF + MultinomialNB pipeline.
predict_tfidf = pipe.predict(testMatrix)
# Baseline: the same classifier (alpha=0.1) trained directly on the selected
# features, without the TF-IDF step, and evaluated on the same test rows.
predict_df = MultinomialNB(alpha=0.1).fit(trainMatrix, trainTarget).predict(testMatrix)

In [14]:
# Sanity check: print the shapes of the two prediction arrays.
print(predict_tfidf.shape, predict_df.shape)

(902869,) (902869,)


In [15]:
# Display names of the 11 news categories, used as `target_names` in the
# classification reports.
# NOTE(review): presumably ordered to match the sorted class ids in
# y_target - confirm against the code that built the dataset.
label = [
    '财经',  # finance
    '股票',  # stocks
    '教育',  # education
    '科技',  # technology
    '时政',  # politics
    '体育',  # sports
    '游戏',  # games
    '娱乐',  # entertainment
    '汽车',  # automobiles
    '社会',  # society
    '军事',  # military
]

In [20]:
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Text report of per-class precision/recall/F1 for the TF-IDF pipeline's
# predictions on the test split, with rows named by `label`.
res_tfidf = classification_report(testTarget, predict_tfidf, target_names=label)
print(res_tfidf)

precision    recall  f1-score   support

         财经       0.80      0.80      0.80     72456
         股票       0.91      0.91      0.91    108079
         教育       0.90      0.87      0.88     70945
         科技       0.93      0.85      0.89    114050
         时政       0.85      0.77      0.81     73408
         体育       0.98      0.98      0.98     92122
         游戏       0.96      0.95      0.95     73959
         娱乐       0.91      0.96      0.93     74200
         汽车       0.95      0.91      0.93     71192
         社会       0.85      0.99      0.91     81826
         军事       0.91      0.96      0.93     70632

avg / total       0.91      0.90      0.90    902869



In [21]:
# Same report for the baseline classifier trained without the TF-IDF step.
res_df = classification_report(testTarget, predict_df, target_names=label)
print(res_df)

precision    recall  f1-score   support

         财经       0.82      0.75      0.78     72456
         股票       0.86      0.94      0.89    108079
         教育       0.92      0.84      0.88     70945
         科技       0.85      0.87      0.86    114050
         时政       0.86      0.77      0.81     73408
         体育       0.98      0.98      0.98     92122
         游戏       0.97      0.95      0.96     73959
         娱乐       0.92      0.96      0.94     74200
         汽车       0.95      0.83      0.89     71192
         社会       0.87      0.99      0.93     81826
         军事       0.93      0.98      0.96     70632

avg / total       0.90      0.90      0.90    902869



In [22]:
# Save the TF-IDF report. The content is the plain-text table returned by
# classification_report, despite the .csv extension.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('result/SK_TFIDFNB.csv', 'w', encoding='utf-8') as f:
    f.write(res_tfidf)

In [23]:
# Save the baseline (no TF-IDF) report. This previously wrote `res_tfidf`,
# which made the file a duplicate of SK_TFIDFNB.csv.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('result/SK_DFNB.csv', 'w', encoding='utf-8') as f:
    f.write(res_df)