# 通过朴素贝叶斯算法进行文本分类

In [1]:
import os
import pandas as pd
import jieba
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#定义分次函数
def cut_words(file_path):
    """
    对文本进行切词
    :param file_path: txt文本路径
    :return: 用空格分词的字符串
    """
    text_with_spaces = ''
    text=open(file_path, 'r', encoding='gb18030').read()
    textcut = jieba.cut(text)
    for word in textcut:
        text_with_spaces += word + ' '
    return text_with_spaces

In [3]:
#文件加载函数
def loadfile(file_dir):
    """
    将路径下的所有文件加载
    :param file_dir: 保存txt文件目录
    :return: 分词后的文档列表和标签
    注意每个文档对应一个label标签
    """
    words_list = []
    labels_list = []
    for root, dirs, files in os.walk(file_dir):
        for file in files:
            label=root.split('/')[-1]
            file_path = os.path.join(root, file)
            words_list.append(cut_words(file_path))
            labels_list.append(label) 
    return words_list, labels_list     
        
#     for file in file_list:
#         file_path = file_dir + '/' + file
#         words_list.append(cut_words(file_path))
#         labels_list.append(label)                                                                                                                 
#     return words_list, labels_list

In [4]:
#第一步：对文档进行分词
train_documents, train_labels = loadfile(r'../data/text classification/train/')
test_documents, test_labels = loadfile(r'../data/text classification/test/')
# print(train_documents)
# print(train_labels)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Administrator\AppData\Local\Temp\jieba.cache
Loading model cost 0.864 seconds.
Prefix dict has been built succesfully.


In [5]:

#第二步：加载停用词
STOP_WORDS = [line.strip() for line in open(r'../data/text classification/stop/stopword.txt' ,errors='ignore',encoding='utf-8').readlines()]
# print(STOP_WORDS)

In [6]:

#第三步：计算单词的权重
tf = TfidfVectorizer(stop_words=STOP_WORDS, max_df=0.5)
train_features = tf.fit_transform(train_documents)

#第四步：生成朴素贝叶斯分类器
clf = MultinomialNB(alpha=0.001).fit(train_features, train_labels)

#第五步：使用生成的分类器做预测
test_tf = TfidfVectorizer(stop_words=STOP_WORDS, max_df=0.5, vocabulary=tf.vocabulary_)
test_features = test_tf.fit_transform(test_documents)

predict_labels = clf.predict(test_features)

#第六步：计算准确率
print(metrics.accuracy_score(test_labels, predict_labels))

  'stop_words.' % sorted(inconsistent))


0.91


  'stop_words.' % sorted(inconsistent))


# 单独对某个文档进行预测

In [7]:
file=r'../data/text classification/test/文学/780.txt'
#过程同样是需要先进行分词，然后计算TF-TDF值
testword_list=[]
testword_list.append(cut_words(file))

test = TfidfVectorizer(stop_words=STOP_WORDS, max_df=0.5, vocabulary=tf.vocabulary_)
features = test.fit_transform(testword_list)
predict = clf.predict(features)
print(predict)

['文学']
